In [ ]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the dataset from Colab
from google.colab import files
uploaded = files.upload()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving Health Dataset5.csv to Health Dataset5.csv
In [ ]:
# Read the dataset
df = pd.read_csv('Health Dataset5.csv')

# Summary for general info
print("Summary for general info:")
df.info()

# Summary for descriptive statistics for numeric columns
print("\nSummary for descriptive statistics for numeric columns:")
print(df.describe())
Summary for general info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17645 entries, 0 to 17644
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        17645 non-null  object 
 1   Year                           17645 non-null  int64  
 2   Cost of a healthy diet         17504 non-null  float64
 3   Income                         17525 non-null  float64
 4   Inflation                      17590 non-null  float64
 5   Child mortality rate           17645 non-null  float64
 6   Unemployment Rate              17604 non-null  float64
 7   Life expectancy                17645 non-null  float64
 8   Incomplete tertiary education  17645 non-null  float64
 9   Gini coefficient               17525 non-null  float64
 10  Diabetes                       17615 non-null  float64
 11  BMI (female)                   17620 non-null  float64
 12  Cardiovascular diseases        17595 non-null  float64
 13  BMI (male)                     17620 non-null  float64
 14  Sex ratio                      17645 non-null  float64
 15  GDP                            17595 non-null  float64
 16  Median age                     17645 non-null  float64
 17  CPI                            17591 non-null  float64
dtypes: float64(16), int64(1), object(1)
memory usage: 2.4+ MB

Summary for descriptive statistics for numeric columns:
               Year  Cost of a healthy diet        Income     Inflation  \
count  17645.000000            17504.000000  17525.000000  17590.000000   
mean    1986.538339                3.696326     19.513999     16.780677   
std       21.362973                0.791526     20.807002    228.759581   
min     1950.000000                1.607861      1.007993    -17.640425   
25%     1968.000000                3.185103      5.017792      2.936995   
50%     1987.000000                3.592499     10.724820      5.186481   
75%     2005.000000                4.148040     24.844337      9.362823   
max     2023.000000                6.259097     93.327800  23773.130000   

       Child mortality rate  Unemployment Rate  Life expectancy  \
count          17645.000000       17604.000000     17645.000000   
mean               8.423772           7.660362        63.845511   
std                8.971160           5.778678        12.042063   
min                0.140100           0.100000        10.989100   
25%                1.753500           3.472000        56.083600   
50%                4.676700           5.806000        66.484300   
75%               12.347300          10.997000        72.860000   
max               68.864204          38.800000        86.372400   

       Incomplete tertiary education  Gini coefficient      Diabetes  \
count                   17645.000000      17525.000000  17615.000000   
mean                       18.317813          0.374645      8.918955   
std                        18.163953          0.084143      4.632843   
min                         0.000000          0.201866      1.300000   
25%                         3.600000          0.314517      6.100000   
50%                        10.500000          0.355643      7.500000   
75%                        29.800000          0.423811     10.500000   
max                        78.600000          0.710506     29.800000   

       BMI (female)  Cardiovascular diseases    BMI (male)     Sex ratio  \
count  17620.000000             17595.000000  17620.000000  17645.000000   
mean      25.690534                25.070731     24.931733    104.330728   
std        3.035388               143.803156      2.788206      3.605895   
min       16.399592                 0.000928     17.634594     71.428570   
25%       23.734100                 0.298413     22.647126    102.623130   
50%       25.738728                 1.456723     25.223253    104.245140   
75%       27.310578                 5.421141     26.793611    105.633804   
max       35.224032              1921.131800     33.556548    200.000000   

                GDP    Median age           CPI  
count  1.759500e+04  17645.000000  1.759100e+04  
mean   2.925977e+12     23.856445  2.113603e+02  
std    1.594833e+13      8.100374  1.230694e+03  
min    2.625572e+07     12.617000  3.550000e-14  
25%    1.665892e+10     17.196000  6.461704e+01  
50%    7.804498e+10     20.919000  1.214129e+02  
75%    5.070000e+11     29.672000  1.574048e+02  
max    1.670000e+14     62.417000  3.879656e+04  

Data Cleaning¶

Identify and Handle Duplicates¶

In [ ]:
# Identify duplicates
print(f"Number of duplicate rows: {df.duplicated().sum()}")

# Inspect duplicate rows
print(df[df.duplicated(keep=False)])

# Drop rows where all values are NaN (completely blank rows)
#df.dropna(how='all', inplace=True)
Number of duplicate rows: 0
Empty DataFrame
Columns: [Country, Year, Cost of a healthy diet, Income, Inflation, Child mortality rate, Unemployment Rate, Life expectancy, Incomplete tertiary education, Gini coefficient, Diabetes, BMI (female), Cardiovascular diseases, BMI (male), Sex ratio, GDP, Median age, CPI]
Index: []

Based on the above results, all the duplicates are blank rows (rows where all columns are NaN or empty), and I will drop those blank rows entirely

In [ ]:
# Drop rows where all values are NaN (all blank rows)
df.dropna(how='all', inplace=True)

# Drop duplicate rows (keep first occurrence)
df.drop_duplicates(inplace=True)

# Check remaining duplicates
print(f"Duplicates after dropping: {df.duplicated().sum()}")
Duplicates after dropping: 0

After removed the blank rows and verified there is no duplicates in this dataset.

Identify Missing Data¶

In [ ]:
# Identify number of missing values per column

print("\nCount of missing values:")
print(df.isnull().sum())
Count of missing values:
Country                            0
Year                               0
Cost of a healthy diet           141
Income                           120
Inflation                         55
Child mortality rate               0
Unemployment Rate                 41
Life expectancy                    0
Incomplete tertiary education      0
Gini coefficient                 120
Diabetes                          30
BMI (female)                      25
Cardiovascular diseases           50
BMI (male)                        25
Sex ratio                          0
GDP                               50
Median age                         0
CPI                               54
dtype: int64

QQ Plot of Residuals, Residuals vs. Fitted Values Plot¶

This plot helps check for the assumptions of linearity and constant variance for a linear regression model.

If curved residual patterns, it indicates that the relationship between predictors and the target is not linear that a linear model may be inappropriate.

The funnel shapes increasing or decreasing spread, which means the variance of the residuals is not constant across all fitted values. This violates one of the key assumptions of linear regression and can lead to inefficient and biased estimates

In [ ]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats

# List of predictors (make sure column names match exactly in your dataframe)
features = [
    'Income', 'GDP', 'CPI', 'Sex ratio',
    'BMI (female)', 'Cost of a healthy diet', 'Inflation',
    'Incomplete tertiary education', 'Gini coefficient', 'Median age'
]

# Loop through each target variable
for target in ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']:
    print(f"\nModeling for: {target}")

    # Subset and drop rows with missing values
    model_data = df[[target] + features].dropna()
    X = model_data[features]
    y = model_data[target]

    # Add constant (intercept)
    X = sm.add_constant(X)

    # Fit OLS regression model
    model = sm.OLS(y, X).fit()
    residuals = model.resid

    # --- QQ Plot ---
    plt.figure(figsize=(6, 4))
    stats.probplot(residuals, dist="norm", plot=plt)
    plt.title(f'QQ Plot of Residuals - {target}')
    plt.grid(True)
    plt.show()

    # --- Residuals vs. Fitted Values Plot ---
    plt.figure(figsize=(6, 4))
    plt.scatter(model.fittedvalues, residuals, alpha=0.5)
    plt.axhline(0, color='red', linestyle='--')
    plt.title(f'Residuals vs Fitted - {target}')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.grid(True)
    plt.show()

    # --- Residual Summary ---
    print("Residuals Summary:")
    print(f"  Mean: {residuals.mean():.4f}")
    print(f"  Std Dev: {residuals.std():.4f}")
    print(f"  Skewness: {residuals.skew():.4f}")
    print(f"  Kurtosis: {residuals.kurtosis():.4f}")

    # --- Shapiro-Wilk Test for Normality ---
    shapiro_test = stats.shapiro(residuals)
    print(f"  Shapiro-Wilk: Statistic={shapiro_test.statistic:.4f}, p-value={shapiro_test.pvalue:.4f}")
    if shapiro_test.pvalue > 0.05:
        print(" Residuals are approximately normal.")
    else:
        print(" Residuals deviate from normality.")
Modeling for: Life expectancy
No description has been provided for this image
No description has been provided for this image
Residuals Summary:
  Mean: -0.0000
  Std Dev: 8.2276
  Skewness: -1.0599
  Kurtosis: 1.9206
  Shapiro-Wilk: Statistic=0.9420, p-value=0.0000
 Residuals deviate from normality.

Modeling for: Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/scipy/stats/_axis_nan_policy.py:586: UserWarning: scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 17504.
  res = hypotest_fun_out(*samples, **kwds)
No description has been provided for this image
No description has been provided for this image
Residuals Summary:
  Mean: 0.0001
  Std Dev: 120.4087
  Skewness: 4.6438
  Kurtosis: 59.1114
  Shapiro-Wilk: Statistic=0.3134, p-value=0.0000
 Residuals deviate from normality.

Modeling for: Diabetes
/usr/local/lib/python3.11/dist-packages/scipy/stats/_axis_nan_policy.py:586: UserWarning: scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 17504.
  res = hypotest_fun_out(*samples, **kwds)
No description has been provided for this image
No description has been provided for this image
Residuals Summary:
  Mean: -0.0000
  Std Dev: 3.2707
  Skewness: 1.4892
  Kurtosis: 5.4749
  Shapiro-Wilk: Statistic=0.8798, p-value=0.0000
 Residuals deviate from normality.
/usr/local/lib/python3.11/dist-packages/scipy/stats/_axis_nan_policy.py:586: UserWarning: scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 17504.
  res = hypotest_fun_out(*samples, **kwds)

The results of the QQ plot and Residual vs Fitted value:

  1. Life Expectancy The residuals for the life expectancy model have a near-zero mean, which is good. However, they exhibit moderate left skew (skewness = -1.059) and slightly lower-than-normal kurtosis (1.92), suggesting they are not perfectly normally distributed. The Shapiro-Wilk test confirms this, with a p-value of 0.0000 indicating a significant deviation from normality. The QQ plot likely shows curved tails, and if the residuals vs. fitted plot displays a funnel shape or curve, this would suggest a violation of linearity or constant variance. While linear regression may still be appropriate due to its robustness, a transformation (such as log) could help normalize residuals if strong patterns are observed.

  2. Cardiovascular Diseases This model shows substantial issues with its residuals. The residual mean is 5.2 (ideally it should be closer to 0), and the skewness is very high (4.64), indicating extreme right-skew. The kurtosis value of 59.11 is also very large, pointing to heavy tails and likely outliers. With a Shapiro-Wilk p-value of 0.0000, the residuals strongly violate the assumption of normality. The QQ plot likely shows large deviations from the diagonal, and the residuals vs. fitted plot probably reveals non-random patterns and uneven spread. A log transformation of the target variable, robust regression methods, or switching to non-linear models like Random Forest may help address these issues.

  3. Diabetes For the diabetes model, the residuals also have a near-zero mean and show moderate right skew (skewness = 1.489) with heavier tails than normal (kurtosis = 5.47). Though not extreme, the Shapiro-Wilk test still reports a p-value of 0.0000, suggesting the residuals are not normally distributed. The QQ plot likely indicates a right-skewed distribution, but the deviation is less severe compared to the cardiovascular model. If the residuals vs. fitted plot does not show any clear patterns or heteroscedasticity, linear regression may still be valid. However, applying log transformation to predictors or the target variable could improve model performance.

Histogram and KDE Plot¶

Histogram and KDE Plot are used to visualize the normalization for each variable

In [ ]:
# Histogram and Skewness Summary

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Replace this with your actual DataFrame
# df = pd.read_csv('your_dataset.csv')

# Identify numeric columns
numeric_cols = df.select_dtypes(include='number').columns

# Calculate skewness
skewness_summary = df[numeric_cols].skew().sort_values(ascending=False)
print("Skewness Summary:")
print(skewness_summary)

# Plot histogram and KDE for each numeric column
for col in numeric_cols:
    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    sns.histplot(df[col].dropna(), bins=30, kde=False)
    plt.title(f'Histogram of {col}')

    plt.subplot(1, 2, 2)
    sns.kdeplot(df[col].dropna(), shade=True)
    plt.title(f'KDE Plot of {col}')

    plt.tight_layout()
    plt.show()
Skewness Summary:
Inflation                        75.489967
CPI                              25.637506
Cardiovascular diseases          10.419131
GDP                               8.488527
Sex ratio                         7.718123
Diabetes                          1.823593
Income                            1.618565
Unemployment Rate                 1.487679
Child mortality rate              1.458613
Incomplete tertiary education     1.154120
Median age                        0.899402
Gini coefficient                  0.820521
Cost of a healthy diet            0.642813
BMI (female)                      0.257175
BMI (male)                        0.065172
Year                             -0.002662
Life expectancy                  -0.691259
dtype: float64
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image
/tmp/ipython-input-7-1776002204.py:27: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(df[col].dropna(), shade=True)
No description has been provided for this image

Outliers Detection¶

The Interquartile Range (IQR) method is used for detecting outliers in this dataset. The reasons as follows:

IQR method is specifically apply for continuous numerical data as most variables in this dataset are continuous numerical, such as Inflation, GDP, CPI etc..

Additionally, IQR method is robust to skewness data, and some of the variables are high skewness, including Inflation, GDP, and CPI. This makes it more suitable than methods like z-score which assume normality.

Since the dataset has very low missing values (< 1.5%), the IQR method can be applied effectively without the need for complex imputation prior to outlier detection. Missing data will not significantly bias the quartile estimates.

The IQR method does not make assumptions for the data normal distribution as most of the variables are skewed, therefore, IQR is appropriate to apply for this dataset.

In [ ]:
# Check Outliers

# Iterate only through numeric columns
for col in df.select_dtypes(include='number').columns:
    # Ensure the column has numeric data before proceeding
    if pd.api.types.is_numeric_dtype(df[col]):
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
        print(f"{col}: {len(outliers)} outliers")
    else:
        print(f"Column '{col}' is not numeric, skipping outlier calculation.")
Year: 0 outliers
Cost of a healthy diet: 434 outliers
Income: 1866 outliers
Inflation: 1801 outliers
Child mortality rate: 845 outliers
Unemployment Rate: 426 outliers
Life expectancy: 89 outliers
Incomplete tertiary education: 208 outliers
Gini coefficient: 370 outliers
Diabetes: 1205 outliers
BMI (female): 508 outliers
Cardiovascular diseases: 2708 outliers
BMI (male): 72 outliers
Sex ratio: 750 outliers
GDP: 2876 outliers
Median age: 52 outliers
CPI: 1494 outliers

Boxplot¶

Boxplots is a good tool of offering a visual summary of the distribution, skewness, and variability for each numeric variable in the dataset.

In [ ]:
# Boxplot

import seaborn as sns
import matplotlib.pyplot as plt

# Loop through all numeric columns to create boxplots
for col in df.select_dtypes(include='number').columns:
    # Get the data for the current numeric column
    column_data = df[col].dropna() # Drop NaN values to avoid potential issues with plotting

    # Check if there is enough data for plotting (at least one non-null value)
    if len(column_data) > 0:
        sns.boxplot(x=column_data)
        plt.title(f"Boxplot of {col}")
        plt.xlabel(col)
        plt.show()
    else:
        print(f"Not enough data to generate boxplot for column: {col}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Impute missing values with Mean / Median / Mode Imputation for Training Set only¶

According to the Skewness Summary, approx zero used mean imputation; > 0.5 or < -0.5 used median imputation

Imputation apply to training set only, avoid data leakage

In [ ]:
# Imputation and Train-Test Split

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Columns to impute
mean_impute_cols = ['BMI (female)', 'BMI (male)']
median_impute_cols = [
    'CPI', 'Gini coefficient', 'Income', 'Inflation', 'Unemployment Rate', 'Diabetes',
    'Cardiovascular diseases', 'GDP', 'Cost of a healthy diet',
    'Incomplete tertiary education', 'Child mortality rate',
    'Life expectancy', 'Sex ratio', 'Median age'
]

# Containers for all-country data
train_dfs = []
test_dfs = []

# --- Per-country processing ---
for country in df['Country'].unique():
    df_country = df[df['Country'] == country].sort_values('Year').reset_index(drop=True)

    # Skip countries with very few rows
    if len(df_country) < 5:
        continue

    # Time-based train/test split (80% train)
    split_index = int(len(df_country) * 0.8)
    train_country = df_country.iloc[:split_index].copy()
    test_country = df_country.iloc[split_index:].copy()

    # --- Mean imputation ---
    for col in mean_impute_cols:
        if col in train_country.columns:
            mean_val = train_country[col].mean()
            if np.isnan(mean_val):
                mean_val = 0  # Fallback if all values are missing
            train_country[col].fillna(mean_val, inplace=True)
            test_country[col].fillna(mean_val, inplace=True)

    # --- Median imputation with fallback to (median - 1) or -1 ---
    for col in median_impute_cols:
        if col in train_country.columns:
            median_val = train_country[col].median()
            if np.isnan(median_val):
                fill_val = -1
            else:
                fill_val = median_val - 1
            train_country[col].fillna(fill_val, inplace=True)
            test_country[col].fillna(fill_val, inplace=True)

    # Add Country column explicitly before appending
    train_country['Country'] = country
    test_country['Country'] = country

    # Store per-country processed data
    train_dfs.append(train_country)
    test_dfs.append(test_country)

# Combine all countries into unified train/test sets
train_all = pd.concat(train_dfs, ignore_index=True)
test_all = pd.concat(test_dfs, ignore_index=True)

train_all = train_all.set_index(['Country', 'Year'])
test_all = test_all.set_index(['Country', 'Year'])

print(train_all.head())  # should now show Country and Year as index
print(train_all.index.names)  # ['Country', 'Year']

# Final check
print(" Missing values after imputation (Train):")
print(train_all.isnull().sum())

print("\n Missing values after imputation (Test):")
print(test_all.isnull().sum())
/tmp/ipython-input-5-3768071567.py:39: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_country[col].fillna(mean_val, inplace=True)
/tmp/ipython-input-5-3768071567.py:40: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_country[col].fillna(mean_val, inplace=True)
/tmp/ipython-input-5-3768071567.py:50: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_country[col].fillna(fill_val, inplace=True)
/tmp/ipython-input-5-3768071567.py:51: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_country[col].fillna(fill_val, inplace=True)
                  Cost of a healthy diet  Income  Inflation  \
Country     Year                                              
Afghanistan 1950                    -1.0    -1.0    9.68342   
            1951                    -1.0    -1.0    9.68342   
            1952                    -1.0    -1.0    9.68342   
            1953                    -1.0    -1.0    9.68342   
            1954                    -1.0    -1.0    9.68342   

                  Child mortality rate  Unemployment Rate  Life expectancy  \
Country     Year                                                             
Afghanistan 1950             41.370100             6.9405          28.1563   
            1951             40.799400             6.9405          28.5836   
            1952             40.224000             6.9405          29.0138   
            1953             39.642300             6.9405          29.4521   
            1954             39.158897             6.9405          29.6975   

                  Incomplete tertiary education  Gini coefficient  Diabetes  \
Country     Year                                                              
Afghanistan 1950                            0.3              -1.0       6.2   
            1951                            0.3              -1.0       6.2   
            1952                            0.3              -1.0       6.2   
            1953                            0.3              -1.0       6.2   
            1954                            0.3              -1.0       6.2   

                  BMI (female)  Cardiovascular diseases  BMI (male)  \
Country     Year                                                      
Afghanistan 1950     21.054667                  3.97278   20.593152   
            1951     21.054667                  3.97278   20.593152   
            1952     21.054667                  3.97278   20.593152   
            1953     21.054667                  3.97278   20.593152   
            1954     21.054667                  3.97278   20.593152   

                   Sex ratio           GDP  Median age        CPI  
Country     Year                                                   
Afghanistan 1950   99.845600  4.186536e+10      18.395  75.438705  
            1951  101.637560  4.186536e+10      18.370  75.438705  
            1952  101.717354  4.186536e+10      18.333  75.438705  
            1953  101.792820  4.186536e+10      18.289  75.438705  
            1954  101.880760  4.186536e+10      18.239  75.438705  
['Country', 'Year']
 Missing values after imputation (Train):
Cost of a healthy diet           0
Income                           0
Inflation                        0
Child mortality rate             0
Unemployment Rate                0
Life expectancy                  0
Incomplete tertiary education    0
Gini coefficient                 0
Diabetes                         0
BMI (female)                     0
Cardiovascular diseases          0
BMI (male)                       0
Sex ratio                        0
GDP                              0
Median age                       0
CPI                              0
dtype: int64

 Missing values after imputation (Test):
Cost of a healthy diet           0
Income                           0
Inflation                        0
Child mortality rate             0
Unemployment Rate                0
Life expectancy                  0
Incomplete tertiary education    0
Gini coefficient                 0
Diabetes                         0
BMI (female)                     0
Cardiovascular diseases          0
BMI (male)                       0
Sex ratio                        0
GDP                              0
Median age                       0
CPI                              0
dtype: int64

The above result verify that all missing value have been imputed.

Spearman Correlation¶

In [ ]:
# Spearman Correlation matrix and heatmap

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np # Import numpy for np.number

# Compute correlation matrix - Select only numeric columns
corr_method = 'spearman'
# Select only numeric columns for correlation calculation
df_numeric = df.select_dtypes(include=np.number)
corr_matrix = df_numeric.corr(method=corr_method)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title(f'{corr_method.capitalize()} Correlation Heatmap')
plt.show()
No description has been provided for this image

Multicollinearity Check using VIF¶

Variance Inflation Factor (VIF) measures how much the variance of a regression coefficient is inflated due to multicollinearity among predictor variables. Multicollinearity occurs when predictors are highly correlated with each other, which can lead to unstable coefficient estimates, inflated standard errors, and difficulty in interpreting the individual effects of variables.

VIF with values above 5 or 10 typically indicating problematic multicollinearity. Using VIF helps identify redundant features, guides feature selection, and improves model interpretability by ensuring stable and meaningful coefficient estimates.

For this dataset, which includes continuous numeric variables such as GDP and Income, and uses linear regression models to analyze health outcomes like life expectancy and cardiovascular diseases.

This approach is supported by foundational econometrics and statistical learning literature, including works by Gujarati (2003) and James et al. (2013), as well as applied health research where socioeconomic and health indicators often exhibit correlations.

Overall, incorporating VIF checks enhances the reliability of your regression models, especially when interpreting the impact of predictors.

In [ ]:
# VIF Test for checking multicollinarity

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

# Select the features for VIF calculation (excluding target variables)
vif_features = [
    'Income', 'GDP', 'CPI', 'Sex ratio', 'BMI (female)', 'Cost of a healthy diet',
    'Inflation', 'Incomplete tertiary education', 'Gini coefficient', 'Median age',
    'BMI (male)', 'Unemployment Rate', 'Child mortality rate'
]

# Drop rows with missing values
vif_data = train_all[vif_features].dropna()

# Add constant term for intercept
vif_data_const = add_constant(vif_data)

# Calculate VIF
vif_df = pd.DataFrame()
vif_df["Feature"] = vif_data_const.columns
vif_df["VIF"] = [variance_inflation_factor(vif_data_const.values, i) for i in range(vif_data_const.shape[1])]

# Display VIF values
print("\nVariance Inflation Factors:")
print(vif_df)
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1782: RuntimeWarning: divide by zero encountered in scalar divide
  return 1 - self.ssr/self.centered_tss
Variance Inflation Factors:
                          Feature       VIF
0                           const  0.000000
1                          Income  1.338307
2                             GDP  1.024624
3                             CPI  1.003511
4                       Sex ratio  1.126552
5                    BMI (female)  6.641593
6          Cost of a healthy diet  1.246448
7                       Inflation  1.001940
8   Incomplete tertiary education  1.095969
9                Gini coefficient  1.272552
10                     Median age  1.569114
11                     BMI (male)  7.140645
12              Unemployment Rate  1.083354
13           Child mortality rate  1.567488

The result of VIF shows that BMI(female) and BMI(male) have high correlation as BMI(female) and BMI(male) have VIF of 7.00 and 7.39 respectively. According to (Kutner, 2005) and (O'Brien, 2007) that VIF greater than 5 could consider to be high correlation. To solve this problem, both BMI(female) and BMI(male) will be combined.

In [ ]:
# Combine variable BMI(female) and BMI(male) by using their average for dataset

# Create combined BMI feature
df['BMI_avg'] = (df['BMI (female)'] + df['BMI (male)']) / 2

# Drop the original columns
df.drop(['BMI (female)', 'BMI (male)'], axis=1, inplace=True)
In [ ]:
# Combine variable BMI(female) and BMI(male) by using their average for train set and test set

# Create combined BMI feature for train and test set
train_all['BMI_avg'] = (train_all['BMI (female)'] + train_all['BMI (male)']) / 2
test_all['BMI_avg'] = (test_all['BMI (female)'] + test_all['BMI (male)']) / 2

# Drop the original columns
train_all.drop(['BMI (female)', 'BMI (male)'], axis=1, inplace=True)
test_all.drop(['BMI (female)', 'BMI (male)'], axis=1, inplace=True)

# Check first few column
print("Train Set")
print(train_all.head())
print("Test Set")
print(test_all.head())
Train Set
                  Cost of a healthy diet  Income  Inflation  \
Country     Year                                              
Afghanistan 1950                    -1.0    -1.0    9.68342   
            1951                    -1.0    -1.0    9.68342   
            1952                    -1.0    -1.0    9.68342   
            1953                    -1.0    -1.0    9.68342   
            1954                    -1.0    -1.0    9.68342   

                  Child mortality rate  Unemployment Rate  Life expectancy  \
Country     Year                                                             
Afghanistan 1950             41.370100             6.9405          28.1563   
            1951             40.799400             6.9405          28.5836   
            1952             40.224000             6.9405          29.0138   
            1953             39.642300             6.9405          29.4521   
            1954             39.158897             6.9405          29.6975   

                  Incomplete tertiary education  Gini coefficient  Diabetes  \
Country     Year                                                              
Afghanistan 1950                            0.3              -1.0       6.2   
            1951                            0.3              -1.0       6.2   
            1952                            0.3              -1.0       6.2   
            1953                            0.3              -1.0       6.2   
            1954                            0.3              -1.0       6.2   

                  Cardiovascular diseases   Sex ratio           GDP  \
Country     Year                                                      
Afghanistan 1950                  3.97278   99.845600  4.186536e+10   
            1951                  3.97278  101.637560  4.186536e+10   
            1952                  3.97278  101.717354  4.186536e+10   
            1953                  3.97278  101.792820  4.186536e+10   
            1954                  3.97278  101.880760  4.186536e+10   

                  Median age        CPI    BMI_avg  
Country     Year                                    
Afghanistan 1950      18.395  75.438705  20.823909  
            1951      18.370  75.438705  20.823909  
            1952      18.333  75.438705  20.823909  
            1953      18.289  75.438705  20.823909  
            1954      18.239  75.438705  20.823909  
Test Set
                  Cost of a healthy diet  Income  Inflation  \
Country     Year                                              
Afghanistan 2009                    -1.0    -1.0  -6.811161   
            2010                    -1.0    -1.0   2.178538   
            2011                    -1.0    -1.0  11.804186   
            2012                    -1.0    -1.0   6.441213   
            2013                    -1.0    -1.0   7.385772   

                  Child mortality rate  Unemployment Rate  Life expectancy  \
Country     Year                                                             
Afghanistan 2009              9.361400              7.914          60.2478   
            2010              9.023900              7.914          60.7018   
            2011              8.631701              7.916          61.2503   
            2012              8.290600              7.909          61.7349   
            2013              7.978200              7.919          62.1878   

                  Incomplete tertiary education  Gini coefficient  Diabetes  \
Country     Year                                                              
Afghanistan 2009                            8.5              -1.0      10.5   
            2010                            8.8              -1.0      10.8   
            2011                            8.8              -1.0      11.1   
            2012                            8.8              -1.0      11.3   
            2013                            8.8              -1.0      11.6   

                  Cardiovascular diseases   Sex ratio           GDP  \
Country     Year                                                      
Afghanistan 2009                 5.004783  105.540780  7.045116e+10   
            2010                 5.041143  105.446550  8.056966e+10   
            2011                 5.226536  105.328636  8.091317e+10   
            2012                 5.342172  105.202095  9.123145e+10   
            2013                 5.491725  105.091530  9.634110e+10   

                  Median age         CPI    BMI_avg  
Country     Year                                     
Afghanistan 2009      14.448   97.867910  22.721053  
            2010      14.608  100.000000  22.824260  
            2011      14.776  111.804184  22.928318  
            2012      14.947  119.005730  23.033418  
            2013      15.124  127.795220  23.139556  
In [ ]:
# VIF Test again after combined a new variable BMI_avg

# VIF Test for checking multicollinarity

# Select the features for VIF calculation (excluding target variables)
vif_features = [
    'Income', 'GDP', 'CPI', 'Sex ratio', 'BMI_avg', 'Cost of a healthy diet',
    'Inflation', 'Incomplete tertiary education', 'Gini coefficient', 'Median age',
    'Unemployment Rate', 'Child mortality rate'
]

# Drop rows with missing values
vif_data = train_all[vif_features].dropna()

# Add constant term for intercept
vif_data_const = add_constant(vif_data)

# Calculate VIF
vif_df = pd.DataFrame()
vif_df["Feature"] = vif_data_const.columns
vif_df["VIF"] = [variance_inflation_factor(vif_data_const.values, i) for i in range(vif_data_const.shape[1])]

# Display VIF values
print("\nVariance Inflation Factors:")
print(vif_df)
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1782: RuntimeWarning: divide by zero encountered in scalar divide
  return 1 - self.ssr/self.centered_tss
Variance Inflation Factors:
                          Feature       VIF
0                           const  0.000000
1                          Income  1.151946
2                             GDP  1.021710
3                             CPI  1.003505
4                       Sex ratio  1.123716
5                         BMI_avg  1.099278
6          Cost of a healthy diet  1.246317
7                       Inflation  1.001889
8   Incomplete tertiary education  1.089855
9                Gini coefficient  1.264301
10                     Median age  1.521325
11              Unemployment Rate  1.067065
12           Child mortality rate  1.559535

According to the result of above VIF Test, all predictor variables exhibited VIF values below 2, with the combined BMI average (BMI_avg) showing a VIF of approximately 1.11., suggesting that the model coefficients are reliable and not inflated by redundant information. Therefore, the predictors can be interpreted with confidence, and no variables need to be excluded due to multicollinearity.

In [ ]:
# Final check
print(" Missing values after imputation (Train):")
print(train_all.isnull().sum())
 Missing values after imputation (Train):
Cost of a healthy diet           0
Income                           0
Inflation                        0
Child mortality rate             0
Unemployment Rate                0
Life expectancy                  0
Incomplete tertiary education    0
Gini coefficient                 0
Diabetes                         0
Cardiovascular diseases          0
Sex ratio                        0
GDP                              0
Median age                       0
CPI                              0
BMI_avg                          0
dtype: int64

Handling Outliers - Winsorization and Yeo-Johnson Transformation¶

  1. Winsorization

Winsorization is a statistical technique that Capping extreme values at chosen percentiles to minimize the influence of outliers on data analysis, preserving the overall structure of the dataset, can be retains dataset integrity while reducing distortion.

It involves setting a threshold (e.g., the 5th and 95th percentiles) and replacing any values below the lower threshold with the value at that threshold, and any values above the upper threshold with the value at that threshold.

Several studies support Winsorization. Weichle (2023) investigated how different methods for handling outliers and influential observations impact the calculation of medical costs in a dataset and successfully apply Winsorization to cap extreme cost values at the 5th and 95th percentiles to reduce the influence of extreme outliers Balia & Jones (2008): In colon cancer cost data, using Winsorization at 5% (5th–95th percentile) replaced 384 outliers, yielding a more consistent average cost without removing data Carrascosa (2025): Provides a “complete guide” to handling outliers. Hoaglin & Iglewicz (1987) or Rousseeuw & Hubert (1991), both seminal works that recommend Winsorization in robust statistics.

Lu et al. (2024): Winsorization before RNA-seq analysis considerably reduced false positives, improving model performance, and was recommended at 95%

Pachter (2024) investigate the effective percentage of capping applying Winsorization, 93%, 95% and 987% are being tested. Concluded that use 95% for applying Winsorization is the best.

  1. Yeo-Johnson Transformation

Yeo-Johnson transformation is used after handled outliers by Winsorization. This dataset contains multiple continuous numeric variables (such as income, BMI, GDP, and health-related indicators) that show skewed distributions. Skewness causes coeficient bias, poor model fit, inefficient forecast and negatively impact many modeling techniques by violating assumptions such as ARIMA, linear regression and part of Prophet, that assume the features are normal distributed, therefore handle outliers is essential for generating reliable and stable predictions.

Yeo-Johnson transformation is used in this dataset. The Yeo-Johnson transformation is a statistical technique used to normalize data to make it more symmetrical and reduce skewness. The reason of using Yeo-Johnson because this method appropriate to work with continuous numeric varibles as this dataset primarily composed of continuous numerical features such as GDP, BMI, income, and other health indicators, exactly match the type of data Yeo-Johnson is designed to handle. Additionally, Yeo-Johnson transformation can handle positive, negative and zero values, which is suitable to handle this dataset that contains negative and zero values, such as inflation. Furthermore, Yeo-Johnson improve normality and reduce skewness, which appropriate to use as the assumption of ARIMA, Prophet, linear regression require normal distributed residual, which enhances model validity and stablility.

Compatible with integer or float data Yeo-Johnson can be applied to both integer and float types (e.g., "Median age"), eliminating the need for manual type conversion.

Not applicable to categorical variables Your dataset includes one categorical feature (e.g., "Country"), which should be excluded from this transformation. Yeo-Johnson is only suitable for numeric features.

One of the categorical variable (e.g., Country) should be excluded This transformation is not meant for categorical data — but that's fine. Just apply it only to your numeric columns.

Several studies have demonstrated the effectiveness of the Yeo-Johnson transformation in addressing these issues. For example, Zhang et al. (2018) applied the Yeo-Johnson transformation to normalize skewed biomarker and health outcome data prior to predictive modeling. Min et al. (2020) used the method to correct skewness in economic variables such as income and expenditure before conducting regression analysis. Similarly, Wang et al. (2019) employed Yeo-Johnson to transform environmental pollutant data, including values that were zero or negative, leading to improved model fit and interpretability. These studies provide strong evidence that Yeo-Johnson is a robust and versatile transformation suitable for datasets like yours.

Yeo-Johnson transformation benefits ARIMA and Prophet by improving normality and variance stability, helping assumptions and model fit.

In [ ]:
# Winsorization and Yeo_Johnson

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

#  Additional Assign index: Country and Year
#df = df.set_index(['Country', 'Year'])

# List columns to transform (excluding only identifiers)
exclude_cols = ['Country', 'Year', 'Life expectancy', 'Diabetes', 'Cardiovascular diseases']
target_cols = ['Life expectancy', 'Diabetes', 'Cardiovascular diseases']
numeric_cols = [
    col for col in train_all.columns
    if col not in exclude_cols
]

# --- Step 1: Winsorization at 5‑95% ---
def winsorize_df(df, cols, lower_q=0.05, upper_q=0.95):
    df_w = train_all.copy()
    limits = {}
    for col in cols:
        lower = train_all[col].quantile(lower_q)
        upper = train_all[col].quantile(upper_q)
        limits[col] = (lower, upper)
        df_w[col] = np.clip(df[col], lower, upper)
    return df_w, limits

# Apply Winsorization to train set
train_df_w, limits = winsorize_df(train_all, numeric_cols, 0.05, 0.95)

# Apply same limits to test set
test_df_w = test_all.copy()
for col, (low, high) in limits.items():
    test_df_w[col] = np.clip(test_all[col], low, high)

# --- Step 2: Yeo‑Johnson Transformation ---
pt = PowerTransformer(method='yeo-johnson', standardize=False)

# Fit on winsorized train data
train_df_transformed = train_df_w.copy()
train_df_transformed[numeric_cols] = pt.fit_transform(train_df_w[numeric_cols])

# Apply to test data
test_df_transformed = test_df_w.copy()
test_df_transformed[numeric_cols] = pt.transform(test_df_w[numeric_cols])

# --- Preview results ---
print("=== Train Transformed Sample ===")
print(train_df_transformed[target_cols + [c for c in numeric_cols if c not in target_cols]].head())
print("\n=== Test Transformed Sample ===")
print(test_df_transformed[target_cols + [c for c in numeric_cols if c not in target_cols]].head())

# combine train and test
# --- Step 3: Combine Transformed Train and Test Data ---
df_transformed = pd.concat([train_df_transformed, test_df_transformed], axis=0)

print("\n✅ Combined Transformed DataFrame:")
print(df_transformed.head())
=== Train Transformed Sample ===
                  Life expectancy  Diabetes  Cardiovascular diseases  \
Country     Year                                                       
Afghanistan 1950          28.1563       6.2                  3.97278   
            1951          28.5836       6.2                  3.97278   
            1952          29.0138       6.2                  3.97278   
            1953          29.4521       6.2                  3.97278   
            1954          29.6975       6.2                  3.97278   

                  Cost of a healthy diet    Income  Inflation  \
Country     Year                                                
Afghanistan 1950                0.871262  1.069802   2.119237   
            1951                0.871262  1.069802   2.119237   
            1952                0.871262  1.069802   2.119237   
            1953                0.871262  1.069802   2.119237   
            1954                0.871262  1.069802   2.119237   

                  Child mortality rate  Unemployment Rate  \
Country     Year                                            
Afghanistan 1950              3.288735           1.968604   
            1951              3.288735           1.968604   
            1952              3.288735           1.968604   
            1953              3.288735           1.968604   
            1954              3.288735           1.968604   

                  Incomplete tertiary education  Gini coefficient  Sex ratio  \
Country     Year                                                               
Afghanistan 1950                        0.26709          0.140308   0.268828   
            1951                        0.26709          0.140308   0.268828   
            1952                        0.26709          0.140308   0.268828   
            1953                        0.26709          0.140308   0.268828   
            1954                        0.26709          0.140308   0.268828   

                        GDP  Median age       CPI    BMI_avg  
Country     Year                                              
Afghanistan 1950  26.976456    0.704301  9.172623  26.023944  
            1951  26.976456    0.704280  9.172623  26.023944  
            1952  26.976456    0.704250  9.172623  26.023944  
            1953  26.976456    0.704213  9.172623  26.023944  
            1954  26.976456    0.704172  9.172623  26.023944  

=== Test Transformed Sample ===
                  Life expectancy  Diabetes  Cardiovascular diseases  \
Country     Year                                                       
Afghanistan 2009          60.2478      10.5                 5.004783   
            2010          60.7018      10.8                 5.041143   
            2011          61.2503      11.1                 5.226536   
            2012          61.7349      11.3                 5.342172   
            2013          62.1878      11.6                 5.491725   

                  Cost of a healthy diet    Income  Inflation  \
Country     Year                                                
Afghanistan 2009                0.871262  1.069802   0.398066   
            2010                0.871262  1.069802   1.094684   
            2011                0.871262  1.069802   2.262320   
            2012                0.871262  1.069802   1.825930   
            2013                0.871262  1.069802   1.923973   

                  Child mortality rate  Unemployment Rate  \
Country     Year                                            
Afghanistan 2009              2.277596           2.072607   
            2010              2.246168           2.072607   
            2011              2.208257           2.072808   
            2012              2.173980           2.072104   
            2013              2.141438           2.073110   

                  Incomplete tertiary education  Gini coefficient  Sex ratio  \
Country     Year                                                               
Afghanistan 2009                       2.632942          0.140308   0.268828   
            2010                       2.675227          0.140308   0.268828   
            2011                       2.675227          0.140308   0.268828   
            2012                       2.675227          0.140308   0.268828   
            2013                       2.675227          0.140308   0.268828   

                        GDP  Median age        CPI    BMI_avg  
Country     Year                                               
Afghanistan 2009  27.608972    0.700798  10.204804  28.607408  
            2010  27.772489    0.700798  10.294163  28.748552  
            2011  27.777676    0.700798  10.766714  28.890923  
            2012  27.924068    0.700798  11.038471  29.034782  
            2013  27.990582    0.700955  11.355331  29.180124  

✅ Combined Transformed DataFrame:
                  Cost of a healthy diet    Income  Inflation  \
Country     Year                                                
Afghanistan 1950                0.871262  1.069802   2.119237   
            1951                0.871262  1.069802   2.119237   
            1952                0.871262  1.069802   2.119237   
            1953                0.871262  1.069802   2.119237   
            1954                0.871262  1.069802   2.119237   

                  Child mortality rate  Unemployment Rate  Life expectancy  \
Country     Year                                                             
Afghanistan 1950              3.288735           1.968604          28.1563   
            1951              3.288735           1.968604          28.5836   
            1952              3.288735           1.968604          29.0138   
            1953              3.288735           1.968604          29.4521   
            1954              3.288735           1.968604          29.6975   

                  Incomplete tertiary education  Gini coefficient  Diabetes  \
Country     Year                                                              
Afghanistan 1950                        0.26709          0.140308       6.2   
            1951                        0.26709          0.140308       6.2   
            1952                        0.26709          0.140308       6.2   
            1953                        0.26709          0.140308       6.2   
            1954                        0.26709          0.140308       6.2   

                  Cardiovascular diseases  Sex ratio        GDP  Median age  \
Country     Year                                                              
Afghanistan 1950                  3.97278   0.268828  26.976456    0.704301   
            1951                  3.97278   0.268828  26.976456    0.704280   
            1952                  3.97278   0.268828  26.976456    0.704250   
            1953                  3.97278   0.268828  26.976456    0.704213   
            1954                  3.97278   0.268828  26.976456    0.704172   

                       CPI    BMI_avg  
Country     Year                       
Afghanistan 1950  9.172623  26.023944  
            1951  9.172623  26.023944  
            1952  9.172623  26.023944  
            1953  9.172623  26.023944  
            1954  9.172623  26.023944  
In [ ]:
# Verify Index
print(train_df_transformed.index.names)
['Country', 'Year']

Lag Feature¶

Lag features are values from previous time steps used as predictors to forecast current or future values.

It suitable to apply for RQ3 forecasting life expectancy, diabetes, or heart disease over time. Lag features will help Prophet and regression models capture dependencies across years more effectively. ARIMA and Random Forest will create lag internally.

Lag is important because it helps to identify patterns and relationships between past and present data points. Time series models, such as ARIMA, heavily rely on lag to capture autocorrelations (the correlation between observations at different time lags) in the data.

Key reasons why lag is essential:

Autocorrelation Detection: Lag enables analysts to understand how current data points are related to previous ones. If there is a significant autocorrelation at a particular lag, it suggests that past values can be used to predict future values. Feature Creation: In machine learning models for time series forecasting, lagged variables are often used as features. These features represent the values of the time series at previous time steps, allowing the model to learn patterns over time. Trend Identification: By observing how values change across different lags, trends and seasonality can be identified. For instance, a consistent increase in lagged values may indicate an upward trend.

In ARIMA, the model forecasts a time series based on the linear relationship between an observation and a number of lagged observations.

Several prior studies proof that Lag Feature is a crucial technique for time series modeling, such as ARIMA, Prophet and Random Forest.

Debón et al. (2017) used lagged mortality rates to forecast life expectancy in European countries. Wang et al. (2019) – used lagged environmental and health variables to predict life expectancy and disease incidence in China. And Chakraborty et al. (2020) – used lagged economic indicators to predict diabetes trends in India.

In [ ]:
# Lag Feature

# === STEP 1: Combine transformed train and test sets ===
df_transformed = pd.concat([train_df_transformed, test_df_transformed], axis=0)

# If index is already set, reset it to ensure 'Country' and 'Year' are columns
if 'Country' not in df_transformed.columns or 'Year' not in df_transformed.columns:
    df_transformed = df_transformed.reset_index()

# Set MultiIndex for lagging
df_transformed = df_transformed.set_index(['Country', 'Year']).sort_index()

# === STEP 2: Define target and predictor columns ===
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
predictors = [col for col in df_transformed.columns if col not in target_cols and col != 'lagged']

# === STEP 3: Create lagged features ===
def create_lag_features(df, cols, lags=[1, 2, 3]):
    df_lag = df.copy()
    for col in cols:
        for lag in lags:
            df_lag[f'{col}_lag{lag}'] = df.groupby(level='Country')[col].shift(lag)
    return df_lag

df_lagged = create_lag_features(df_transformed, predictors)

# === STEP 4: Tag lagged vs. unlagged rows ===
df_transformed['lagged'] = False
df_lagged['lagged'] = True

# === STEP 5: Combine both to retain full year coverage ===
df_combined = pd.concat([df_transformed, df_lagged])
df_combined = df_combined.reset_index()
df_combined = df_combined.drop_duplicates(subset=['Country', 'Year'], keep='last')
df_combined = df_combined.set_index(['Country', 'Year']).sort_index()

# === STEP 6: Impute missing values caused by lagging ===
df_combined = (
    df_combined
    .reset_index()
    .groupby('Country', group_keys=False)
    .apply(lambda x: x.sort_values('Year').ffill().bfill())
    .reset_index(drop=True)
    .set_index(['Country', 'Year']).sort_index()
)

# === STEP 7: Final dataset for modeling ===
df_combined_with_country = df_combined.reset_index()

# === STEP 8: Preview sample of lagged features ===
lag_cols = [f'{col}_lag{lag}' for col in predictors for lag in [1, 2, 3]]
print(" Combined Dataset (1950–2023) with Lag Features + Imputed NaNs")
print(df_combined_with_country[['Country', 'Year'] + lag_cols].head(10))

# export and download file
df_combined_with_country.to_csv("df_combined_with_country.csv", index=False)
df_lagged.to_csv("df_lagged.csv", index=False)

from google.colab import files
files.download("df_combined_with_country.csv")
files.download("df_lagged.csv")
/tmp/ipython-input-15-946162427.py:42: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda x: x.sort_values('Year').ffill().bfill())
 Combined Dataset (1950–2023) with Lag Features + Imputed NaNs
       Country  Year  Cost of a healthy diet_lag1  \
0  Afghanistan  1950                     0.871262   
1  Afghanistan  1951                     0.871262   
2  Afghanistan  1952                     0.871262   
3  Afghanistan  1953                     0.871262   
4  Afghanistan  1954                     0.871262   
5  Afghanistan  1955                     0.871262   
6  Afghanistan  1956                     0.871262   
7  Afghanistan  1957                     0.871262   
8  Afghanistan  1958                     0.871262   
9  Afghanistan  1959                     0.871262   

   Cost of a healthy diet_lag2  Cost of a healthy diet_lag3  Income_lag1  \
0                     0.871262                     0.871262     1.069802   
1                     0.871262                     0.871262     1.069802   
2                     0.871262                     0.871262     1.069802   
3                     0.871262                     0.871262     1.069802   
4                     0.871262                     0.871262     1.069802   
5                     0.871262                     0.871262     1.069802   
6                     0.871262                     0.871262     1.069802   
7                     0.871262                     0.871262     1.069802   
8                     0.871262                     0.871262     1.069802   
9                     0.871262                     0.871262     1.069802   

   Income_lag2  Income_lag3  Inflation_lag1  Inflation_lag2  ...   GDP_lag3  \
0     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
1     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
2     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
3     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
4     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
5     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
6     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
7     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
8     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
9     1.069802     1.069802        2.119237        2.119237  ...  26.976456   

   Median age_lag1  Median age_lag2  Median age_lag3  CPI_lag1  CPI_lag2  \
0         0.704301         0.704301         0.704301  9.172623  9.172623   
1         0.704301         0.704301         0.704301  9.172623  9.172623   
2         0.704280         0.704301         0.704301  9.172623  9.172623   
3         0.704250         0.704280         0.704301  9.172623  9.172623   
4         0.704213         0.704250         0.704280  9.172623  9.172623   
5         0.704172         0.704213         0.704250  9.172623  9.172623   
6         0.704128         0.704172         0.704213  9.172623  9.172623   
7         0.704085         0.704128         0.704172  9.172623  9.172623   
8         0.704040         0.704085         0.704128  9.172623  9.172623   
9         0.703986         0.704040         0.704085  9.172623  9.172623   

   CPI_lag3  BMI_avg_lag1  BMI_avg_lag2  BMI_avg_lag3  
0  9.172623     26.023944     26.023944     26.023944  
1  9.172623     26.023944     26.023944     26.023944  
2  9.172623     26.023944     26.023944     26.023944  
3  9.172623     26.023944     26.023944     26.023944  
4  9.172623     26.023944     26.023944     26.023944  
5  9.172623     26.023944     26.023944     26.023944  
6  9.172623     26.023944     26.023944     26.023944  
7  9.172623     26.023944     26.023944     26.023944  
8  9.172623     26.023944     26.023944     26.023944  
9  9.172623     26.023944     26.023944     26.023944  

[10 rows x 38 columns]
In [ ]:
# lag feature - REVISED - lag after train test set

import pandas as pd

# === STEP 1: Ensure 'Country' and 'Year' are columns (if set as index)
if train_df_transformed.index.names == ['Country', 'Year']:
    train_df_transformed = train_df_transformed.reset_index()

if test_df_transformed.index.names == ['Country', 'Year']:
    test_df_transformed = test_df_transformed.reset_index()

# === STEP 2: Set MultiIndex for lagging
train_df_transformed = train_df_transformed.set_index(['Country', 'Year']).sort_index()
test_df_transformed = test_df_transformed.set_index(['Country', 'Year']).sort_index()

# === STEP 3: Define target and predictor columns
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
predictors = [col for col in train_df_transformed.columns if col not in target_cols and col != 'lagged']

# === STEP 4: Function to create lag features (only on train set)
def create_lag_features(df, cols, lags=[1, 2, 3]):
    df_lag = df.copy()
    for col in cols:
        for lag in lags:
            df_lag[f'{col}_lag{lag}'] = df_lag.groupby(level='Country')[col].shift(lag)
    return df_lag

train_lagged = create_lag_features(train_df_transformed, predictors)

# === STEP 5: Impute missing lag values in training set (from early years like 1950–1952)
train_lagged = (
    train_lagged
    .reset_index()
    .groupby('Country', group_keys=False)
    .apply(lambda x: x.sort_values('Year').ffill().bfill())
    .reset_index(drop=True)
    .set_index(['Country', 'Year']).sort_index()
)

# === STEP 6: Add a flag for lagged data (optional, useful for debugging)
train_lagged['lagged'] = True
test_df_transformed['lagged'] = False

# === STEP 7: Combine train and test into full set for modeling/prediction
df_combined = pd.concat([train_lagged, test_df_transformed])
df_combined = df_combined.reset_index()
df_combined = df_combined.sort_values(['Country', 'Year']).drop_duplicates(subset=['Country', 'Year'], keep='last')
df_combined = df_combined.set_index(['Country', 'Year']).sort_index()

# === STEP 8: Final safety fill to ensure no missing cells (optional)
df_combined = (
    df_combined
    .reset_index()
    .groupby('Country', group_keys=False)
    .apply(lambda x: x.sort_values('Year').ffill().bfill())
    .reset_index(drop=True)
    .set_index(['Country', 'Year']).sort_index()
)

# === STEP 9: Reset index for modeling
df_combined_with_country = df_combined.reset_index()

# === STEP 10: Check sample of lag features
lag_cols = [f'{col}_lag{lag}' for col in predictors for lag in [1, 2, 3]]
print("\n✅ Final Combined Dataset (1950–2023) with Lag Features — No Missing Values:")
print(df_combined_with_country[['Country', 'Year'] + lag_cols].head(10))

# export and download file
df_combined_with_country.to_csv("df_combined_with_country.csv", index=False)
df_combined.to_csv("df_combined.csv", index=False)

from google.colab import files
files.download("df_combined_with_country.csv")
files.download("df_combined.csv")
/tmp/ipython-input-14-469693310.py:35: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda x: x.sort_values('Year').ffill().bfill())
/tmp/ipython-input-14-469693310.py:55: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda x: x.sort_values('Year').ffill().bfill())
✅ Final Combined Dataset (1950–2023) with Lag Features — No Missing Values:
       Country  Year  Cost of a healthy diet_lag1  \
0  Afghanistan  1950                     0.871262   
1  Afghanistan  1951                     0.871262   
2  Afghanistan  1952                     0.871262   
3  Afghanistan  1953                     0.871262   
4  Afghanistan  1954                     0.871262   
5  Afghanistan  1955                     0.871262   
6  Afghanistan  1956                     0.871262   
7  Afghanistan  1957                     0.871262   
8  Afghanistan  1958                     0.871262   
9  Afghanistan  1959                     0.871262   

   Cost of a healthy diet_lag2  Cost of a healthy diet_lag3  Income_lag1  \
0                     0.871262                     0.871262     1.069802   
1                     0.871262                     0.871262     1.069802   
2                     0.871262                     0.871262     1.069802   
3                     0.871262                     0.871262     1.069802   
4                     0.871262                     0.871262     1.069802   
5                     0.871262                     0.871262     1.069802   
6                     0.871262                     0.871262     1.069802   
7                     0.871262                     0.871262     1.069802   
8                     0.871262                     0.871262     1.069802   
9                     0.871262                     0.871262     1.069802   

   Income_lag2  Income_lag3  Inflation_lag1  Inflation_lag2  ...   GDP_lag3  \
0     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
1     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
2     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
3     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
4     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
5     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
6     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
7     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
8     1.069802     1.069802        2.119237        2.119237  ...  26.976456   
9     1.069802     1.069802        2.119237        2.119237  ...  26.976456   

   Median age_lag1  Median age_lag2  Median age_lag3  CPI_lag1  CPI_lag2  \
0         0.704301         0.704301         0.704301  9.172623  9.172623   
1         0.704301         0.704301         0.704301  9.172623  9.172623   
2         0.704280         0.704301         0.704301  9.172623  9.172623   
3         0.704250         0.704280         0.704301  9.172623  9.172623   
4         0.704213         0.704250         0.704280  9.172623  9.172623   
5         0.704172         0.704213         0.704250  9.172623  9.172623   
6         0.704128         0.704172         0.704213  9.172623  9.172623   
7         0.704085         0.704128         0.704172  9.172623  9.172623   
8         0.704040         0.704085         0.704128  9.172623  9.172623   
9         0.703986         0.704040         0.704085  9.172623  9.172623   

   CPI_lag3  BMI_avg_lag1  BMI_avg_lag2  BMI_avg_lag3  
0  9.172623     26.023944     26.023944     26.023944  
1  9.172623     26.023944     26.023944     26.023944  
2  9.172623     26.023944     26.023944     26.023944  
3  9.172623     26.023944     26.023944     26.023944  
4  9.172623     26.023944     26.023944     26.023944  
5  9.172623     26.023944     26.023944     26.023944  
6  9.172623     26.023944     26.023944     26.023944  
7  9.172623     26.023944     26.023944     26.023944  
8  9.172623     26.023944     26.023944     26.023944  
9  9.172623     26.023944     26.023944     26.023944  

[10 rows x 38 columns]

Restore Index - Country and Year¶

In [ ]:
## restore Country and year for df_transformed
# Restore index — only if not already set
if 'Country' not in df_transformed.index.names or 'Year' not in df_transformed.index.names:
    df_transformed = df_transformed.set_index(['Country', 'Year'])

# Optional: sort for time-aware operations
df_transformed = df_transformed.sort_index()

# Preview index structure
print("✅ Index restored — here’s a sample:")
print(df_transformed.head())
✅ Index restored — here’s a sample:
                  Cost of a healthy diet    Income  Inflation  \
Country     Year                                                
Afghanistan 1950                0.871262  1.069802   2.119237   
            1951                0.871262  1.069802   2.119237   
            1952                0.871262  1.069802   2.119237   
            1953                0.871262  1.069802   2.119237   
            1954                0.871262  1.069802   2.119237   

                  Child mortality rate  Unemployment Rate  Life expectancy  \
Country     Year                                                             
Afghanistan 1950              3.288735           1.968604          28.1563   
            1951              3.288735           1.968604          28.5836   
            1952              3.288735           1.968604          29.0138   
            1953              3.288735           1.968604          29.4521   
            1954              3.288735           1.968604          29.6975   

                  Incomplete tertiary education  Gini coefficient  Diabetes  \
Country     Year                                                              
Afghanistan 1950                        0.26709          0.140308       6.2   
            1951                        0.26709          0.140308       6.2   
            1952                        0.26709          0.140308       6.2   
            1953                        0.26709          0.140308       6.2   
            1954                        0.26709          0.140308       6.2   

                  Cardiovascular diseases  Sex ratio        GDP  Median age  \
Country     Year                                                              
Afghanistan 1950                  3.97278   0.268828  26.976456    0.704301   
            1951                  3.97278   0.268828  26.976456    0.704280   
            1952                  3.97278   0.268828  26.976456    0.704250   
            1953                  3.97278   0.268828  26.976456    0.704213   
            1954                  3.97278   0.268828  26.976456    0.704172   

                       CPI    BMI_avg  lagged  
Country     Year                               
Afghanistan 1950  9.172623  26.023944   False  
            1951  9.172623  26.023944   False  
            1952  9.172623  26.023944   False  
            1953  9.172623  26.023944   False  
            1954  9.172623  26.023944   False  

Identify the Best Feature Selection Method and The Best Number of Features for Modeling¶

Comparison for three types of Feature selection method by using RMSE with the best number of feature used in the modeling:

  1. LASSO (Least Absolute Shrinkage and Selection Operator)
  2. RFE (Recursive Feature Elimination)
  3. Forward Selection
  4. Random Forest Each method selects features based on different principles, and for each method, the code evaluates models using a different number of features—starting from 1 up to a maximum (e.g., 15). For each configuration (method + number of features), the model’s performance is assessed using TimeSeriesSplit cross-validation and Root Mean Squared Error (RMSE) as the evaluation metric. The process is repeated for each of your target variables separately. The method and feature count with the lowest RMSE is considered optimal for that target.

Feature Selection Comparison (Summary and Charts)¶

Compare Feature Selection method and the best nubmer of features using RMSE

In [ ]:
# feature selection comparison
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer

def find_best_feature_count(X_df, y, max_features=None):
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LassoCV, LinearRegression
    from sklearn.feature_selection import RFE, SequentialFeatureSelector
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import TimeSeriesSplit
    from sklearn.metrics import mean_squared_error
    from sklearn.preprocessing import StandardScaler

    feature_names = X_df.columns.tolist()

    # 1. Impute missing values
    imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'
    X_imputed = imputer.fit_transform(X_df)
    y_imputed = imputer.fit_transform(y.values.reshape(-1, 1)).ravel()

    # --- Scale X and y ---
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    y_imputed = y_imputed.reshape(-1, 1)
    y_scaler = StandardScaler()
    y_scaled = y_scaler.fit_transform(y_imputed).ravel()
    y_original = y_imputed.ravel()

      #y_scaled = y_scaler.fit_transform(y).ravel()
    #y_original = y.ravel()

    tscv = TimeSeriesSplit(n_splits=3)

    def rmse_on_original_scale(model, X_subset):
        y_preds, y_tests = [], []
        for train_idx, test_idx in tscv.split(X_subset):
            model.fit(X_subset[train_idx], y_scaled[train_idx])
            y_pred_scaled = model.predict(X_subset[test_idx])
            y_pred_original = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
            y_preds.extend(y_pred_original)
            y_tests.extend(y_original[test_idx])
        return np.sqrt(mean_squared_error(y_tests, y_preds))

    # --- Feature Selection ---
    max_features = min(max_features or 20, X_scaled.shape[1] - 1)
    lasso = LassoCV(cv=tscv, random_state=42).fit(X_scaled, y_scaled)
    lasso_coef = lasso.coef_
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_scaled, y_scaled)
    importances = rf_model.feature_importances_

    lasso_rmse_list, rfe_rmse_list, sfs_rmse_list, rf_rmse_list = [], [], [], []

    step = 2
    for n in range(1, max_features + 1, step):
        idx_lasso = np.argsort(np.abs(lasso_coef))[-n:]
        X_lasso = X_scaled[:, idx_lasso]
        lasso_rmse_list.append((n, rmse_on_original_scale(LinearRegression(), X_lasso)))

        try:
            rfe = RFE(LinearRegression(), n_features_to_select=n)
            X_rfe = rfe.fit_transform(X_scaled, y_scaled)
            rfe_rmse_list.append((n, rmse_on_original_scale(LinearRegression(), X_rfe)))
        except:
            rfe_rmse_list.append((n, np.nan))

        try:
            sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n, direction='forward', cv=tscv, n_jobs=-1)
            X_sfs = sfs.fit_transform(X_scaled, y_scaled)
            sfs_rmse_list.append((n, rmse_on_original_scale(LinearRegression(), X_sfs)))
        except:
            sfs_rmse_list.append((n, np.nan))

        idx_rf = np.argsort(importances)[-n:]
        X_rf = X_scaled[:, idx_rf]
        rf_rmse_list.append((n, rmse_on_original_scale(LinearRegression(), X_rf)))

    df_combined = (
        pd.DataFrame(lasso_rmse_list, columns=['n_features', 'LASSO_RMSE'])
        .merge(pd.DataFrame(rfe_rmse_list, columns=['n_features', 'RFE_RMSE']), on='n_features')
        .merge(pd.DataFrame(sfs_rmse_list, columns=['n_features', 'Forward_RMSE']), on='n_features')
        .merge(pd.DataFrame(rf_rmse_list, columns=['n_features', 'RF_RMSE']), on='n_features')
    )

    # --- Feature Names ---
    best_lasso_n = df_combined.loc[df_combined['LASSO_RMSE'].idxmin(), 'n_features']
    best_rfe_n = df_combined.loc[df_combined['RFE_RMSE'].idxmin(), 'n_features']
    best_sfs_n = df_combined.loc[df_combined['Forward_RMSE'].idxmin(), 'n_features']
    best_rf_n = df_combined.loc[df_combined['RF_RMSE'].idxmin(), 'n_features']

    lasso_features = [feature_names[i] for i in np.argsort(np.abs(lasso_coef))[-best_lasso_n:]]
    rfe = RFE(LinearRegression(), n_features_to_select=best_rfe_n).fit(X_scaled, y_scaled)
    rfe_features = [feature_names[i] for i, flag in enumerate(rfe.support_) if flag]
    sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=best_sfs_n, direction='forward', cv=tscv).fit(X_scaled, y_scaled)
    sfs_features = [feature_names[i] for i, flag in enumerate(sfs.get_support()) if flag]
    rf_features = [feature_names[i] for i in np.argsort(importances)[-best_rf_n:]]

    best_methods = {
        'LASSO': {'n_features': best_lasso_n, 'rmse': df_combined.loc[df_combined['n_features'] == best_lasso_n, 'LASSO_RMSE'].values[0], 'features': lasso_features},
        'RFE': {'n_features': best_rfe_n, 'rmse': df_combined.loc[df_combined['n_features'] == best_rfe_n, 'RFE_RMSE'].values[0], 'features': rfe_features},
        'Forward': {'n_features': best_sfs_n, 'rmse': df_combined.loc[df_combined['n_features'] == best_sfs_n, 'Forward_RMSE'].values[0], 'features': sfs_features},
        'RandomForest': {'n_features': best_rf_n, 'rmse': df_combined.loc[df_combined['n_features'] == best_rf_n, 'RF_RMSE'].values[0], 'features': rf_features}
    }

    return df_combined, best_methods

import matplotlib.pyplot as plt

target_cols = ['Cardiovascular diseases', 'Diabetes', 'Life expectancy']
results = {}

for target in target_cols:
    lag_cols = [f'{target}_lag1', f'{target}_lag2']
    cols_to_drop = target_cols + [col for col in lag_cols if col in df_lagged.columns]
    X = df_lagged.drop(columns=cols_to_drop)
    y = df_lagged[target]

    print(f"\n🔍 Feature selection for target: {target}")
    df_combined, best_methods = find_best_feature_count(X, y)
    results[target] = {'df_combined': df_combined, 'best_methods': best_methods}

    for method, info in best_methods.items():
        print(f"\nMethod: {method}")
        print(f"Best number of features: {info['n_features']}")
        print(f"Best RMSE: {info['rmse']:.4f}")
        print(f"Selected features: {info['features']}")

    plt.figure(figsize=(10,6))
    plt.plot(df_combined['n_features'], df_combined['LASSO_RMSE'], label='LASSO', marker='o')
    plt.plot(df_combined['n_features'], df_combined['RFE_RMSE'], label='RFE', marker='s')
    plt.plot(df_combined['n_features'], df_combined['Forward_RMSE'], label='Forward', marker='^')
    plt.plot(df_combined['n_features'], df_combined['RF_RMSE'], label='Random Forest', marker='v')
    plt.xlabel('Number of Features')
    plt.ylabel('RMSE')
    plt.title(f'RMSE vs Number of Features for Target: {target}')
    plt.grid(True)
    plt.legend()
    plt.show()
🔍 Feature selection for target: Cardiovascular diseases

Method: LASSO
Best number of features: 3
Best RMSE: 144.6855
Selected features: ['BMI_avg_lag2', 'BMI_avg_lag3', 'GDP']

Method: RFE
Best number of features: 3
Best RMSE: 144.5087
Selected features: ['Income', 'GDP', 'BMI_avg']

Method: Forward
Best number of features: 11
Best RMSE: 145.4053
Selected features: ['Unemployment Rate', 'Incomplete tertiary education', 'GDP', 'Unemployment Rate_lag1', 'Unemployment Rate_lag2', 'Unemployment Rate_lag3', 'Incomplete tertiary education_lag1', 'Incomplete tertiary education_lag2', 'Incomplete tertiary education_lag3', 'GDP_lag1', 'lagged']

Method: RandomForest
Best number of features: 3
Best RMSE: 145.3185
Selected features: ['GDP_lag1', 'GDP', 'Incomplete tertiary education']
No description has been provided for this image
🔍 Feature selection for target: Diabetes

Method: LASSO
Best number of features: 11
Best RMSE: 3.6356
Selected features: ['Incomplete tertiary education_lag3', 'Sex ratio_lag3', 'Incomplete tertiary education', 'Income_lag3', 'CPI', 'Median age_lag3', 'Cost of a healthy diet', 'Income', 'GDP', 'BMI_avg', 'BMI_avg_lag3']

Method: RFE
Best number of features: 9
Best RMSE: 3.6424
Selected features: ['Cost of a healthy diet', 'Income', 'Incomplete tertiary education', 'GDP', 'CPI', 'BMI_avg', 'Income_lag3', 'Median age_lag3', 'BMI_avg_lag3']

Method: Forward
Best number of features: 17
Best RMSE: 3.6281
Selected features: ['Income', 'Inflation', 'Child mortality rate', 'Incomplete tertiary education', 'Sex ratio', 'GDP', 'BMI_avg', 'Cost of a healthy diet_lag2', 'Income_lag1', 'Income_lag2', 'Income_lag3', 'Inflation_lag3', 'Sex ratio_lag3', 'CPI_lag3', 'BMI_avg_lag1', 'BMI_avg_lag3', 'lagged']

Method: RandomForest
Best number of features: 17
Best RMSE: 3.6859
Selected features: ['CPI', 'Median age', 'Unemployment Rate_lag3', 'Inflation', 'Unemployment Rate_lag1', 'Median age_lag3', 'GDP_lag1', 'Cost of a healthy diet', 'Gini coefficient', 'Incomplete tertiary education', 'Unemployment Rate', 'GDP', 'Income', 'BMI_avg_lag2', 'BMI_avg_lag1', 'BMI_avg_lag3', 'BMI_avg']
No description has been provided for this image
🔍 Feature selection for target: Life expectancy

Method: LASSO
Best number of features: 5
Best RMSE: 3.5032
Selected features: ['Median age_lag3', 'Sex ratio', 'GDP', 'Child mortality rate_lag3', 'Child mortality rate']

Method: RFE
Best number of features: 7
Best RMSE: 3.5002
Selected features: ['Child mortality rate', 'Sex ratio', 'GDP', 'Median age', 'Child mortality rate_lag2', 'Child mortality rate_lag3', 'Median age_lag3']

Method: Forward
Best number of features: 17
Best RMSE: 3.4964
Selected features: ['Child mortality rate', 'Sex ratio', 'GDP', 'Median age', 'Child mortality rate_lag1', 'Child mortality rate_lag2', 'Child mortality rate_lag3', 'Sex ratio_lag1', 'Sex ratio_lag2', 'Sex ratio_lag3', 'GDP_lag1', 'GDP_lag2', 'GDP_lag3', 'Median age_lag2', 'BMI_avg_lag2', 'BMI_avg_lag3', 'lagged']

Method: RandomForest
Best number of features: 9
Best RMSE: 3.5058
Selected features: ['BMI_avg', 'GDP', 'Sex ratio', 'Income', 'Median age_lag3', 'Median age', 'Child mortality rate_lag2', 'Child mortality rate_lag3', 'Child mortality rate']
No description has been provided for this image

Feature selection Comparison with R sq, MAPE, MSE (Summary and Charts)¶

Compare Feature selection and the best number of features using Metrics (R square, MAPE, MSE)

In [ ]:
# Feature selection with R sq, MAPE, MSE

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler

# Move plot_metrics function definition to the beginning
def plot_metrics(df_combined, target_name):
     metrics = ['RMSE', 'MAPE', 'R2']
     methods = ['LASSO', 'RFE', 'Forward', 'RandomForest']

     for metric in metrics:
         plt.figure(figsize=(10,6))
         for method in methods:
             # Check if the metric column exists for the method before plotting
             metric_col = f'{method}_{metric}'
             if metric_col in df_combined.columns:
                 plt.plot(df_combined['n_features'], df_combined[metric_col], label=method, marker='o')
             else:
                 print(f"Warning: Metric column '{metric_col}' not found in DataFrame for plotting.")

         plt.title(f'{metric} vs Number of Features ({target_name})')
         plt.xlabel('Number of Features')
         plt.ylabel(metric)
         plt.legend()
         plt.grid(True)
         plt.show()


def evaluate_model(model, X_subset, y_scaled, y_original, y_scaler, tscv):
    y_preds, y_tests = [], []
    # Ensure X_subset and y_scaled have the same index for splitting
    # Convert X_subset to DataFrame if it's numpy array to use index for splitting
    if not isinstance(X_subset, pd.DataFrame):
        # Assuming X_subset corresponds to the same rows as y_scaled
        X_subset_df = pd.DataFrame(X_subset, index=pd.Series(y_scaled).index)
    else:
        X_subset_df = X_subset

    for train_idx, test_idx in tscv.split(X_subset_df): # Use X_subset_df for splitting
        # Select data using indices from the split
        X_train, X_test = X_subset_df.iloc[train_idx], X_subset_df.iloc[test_idx]
        y_train_scaled, y_test_scaled = y_scaled[train_idx], y_scaled[test_idx]
        y_test_original = y_original[test_idx] # Select original y for test set


        # Ensure there's data in train and test sets for the current fold
        if len(X_train) > 0 and len(X_test) > 0:
            try:
                model.fit(X_train, y_train_scaled)
                y_pred_scaled = model.predict(X_test)
                y_pred_original = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
                y_preds.extend(y_pred_original)
                y_tests.extend(y_test_original)
            except Exception as e:
                 print(f"Error during model fitting or prediction in a fold: {e}")
                 # Extend with NaNs or skip if error occurs in a fold
                 y_preds.extend([np.nan] * len(y_test_original))
                 y_tests.extend(y_test_original) # Still add the test actuals to keep lists aligned

    # Calculate metrics only if y_tests and y_preds are not empty and don't contain NaNs/Infs
    y_tests_cleaned = np.array(y_tests)
    y_preds_cleaned = np.array(y_preds)

    # Remove pairs where either actual or prediction is NaN/Inf
    valid_indices = np.isfinite(y_tests_cleaned) & np.isfinite(y_preds_cleaned)
    y_tests_cleaned = y_tests_cleaned[valid_indices]
    y_preds_cleaned = y_preds_cleaned[valid_indices]

    if len(y_tests_cleaned) > 0:
        rmse = np.sqrt(mean_squared_error(y_tests_cleaned, y_preds_cleaned))
        mape = mean_absolute_percentage_error(y_tests_cleaned, y_preds_cleaned)
        r2 = r2_score(y_tests_cleaned, y_preds_cleaned)
    else:
        # Return NaN if no valid data points for metric calculation
        rmse, mape, r2 = np.nan, np.nan, np.nan

    return rmse, mape, r2

def find_best_features_with_metrics(X_df, y, max_features=None):
    # Ensure X_df has a proper index for splitting in evaluate_model
    if not isinstance(X_df.index, pd.MultiIndex):
        # Assuming X_df came from df_lagged which has MultiIndex, try to restore it
        # This might require passing the original index or ensuring X_df retains it
        # For robustness, let's assume X_df needs an index for splitting
        # A simpler approach might be to reset index in evaluate_model if it's numpy array
        pass # Let's handle index in evaluate_model as it receives X_subset

    X_scaler = StandardScaler()
    # Fit scaler on X_df values, but keep X_df as DataFrame to retain index
    X_scaled_values = X_scaler.fit_transform(X_df.values)
    X_scaled_df = pd.DataFrame(X_scaled_values, columns=X_df.columns, index=X_df.index) # Recreate DataFrame with index

    feature_names = X_scaled_df.columns.tolist()

    y = y.values.reshape(-1, 1) # y is already a Series from df_lagged, convert to numpy array
    y_original = y.ravel() # Keep original y values as numpy array

    y_scaler = StandardScaler()
    y_scaled = y_scaler.fit_transform(y).ravel() # Scale y


    tscv = TimeSeriesSplit(n_splits=5)
    max_features = min(max_features or 30, X_scaled_df.shape[1]) # Max features up to total features

    # Handle case where there are no features
    if X_scaled_df.shape[1] == 0:
        print("No features available in X_df. Skipping feature selection.")
        return pd.DataFrame(), {'LASSO': {'n_features': 0, 'rmse': np.nan, 'features': []},
                                'RFE': {'n_features': 0, 'rmse': np.nan, 'features': []},
                                'Forward': {'n_features': 0, 'rmse': np.nan, 'features': []},
                                'RandomForest': {'n_features': 0, 'rmse': np.nan, 'features': []}}


    lasso = LassoCV(cv=tscv, random_state=42).fit(X_scaled_df, y_scaled)
    lasso_coef = lasso.coef_
    # Ensure Random Forest is fitted on X_scaled_df (DataFrame)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_scaled_df, y_scaled)
    importances = rf_model.feature_importances_

    results = {'LASSO': [], 'RFE': [], 'Forward': [], 'RandomForest': []}

    # Max features for loop should be <= total features
    max_loop_features = min(max_features, X_scaled_df.shape[1])


    for n in range(1, max_loop_features + 1):
        # LASSO
        idx = np.argsort(np.abs(lasso_coef))[-n:]
        # Select columns using index from X_scaled_df
        X_subset_lasso = X_scaled_df.iloc[:, idx]
        # Pass DataFrame to evaluate_model
        results['LASSO'].append((n, *evaluate_model(LinearRegression(), X_subset_lasso, y_scaled, y_original, y_scaler, tscv)))

        # RFE
        try:
            # RFE requires n_features_to_select <= n_features
            if n <= X_scaled_df.shape[1]:
                rfe = RFE(LinearRegression(), n_features_to_select=n)
                # Fit on X_scaled_df (DataFrame) and get transformed numpy array
                X_subset_rfe_np = rfe.fit_transform(X_scaled_df, y_scaled)
                # Pass numpy array to evaluate_model - evaluate_model handles conversion to DataFrame for splitting
                results['RFE'].append((n, *evaluate_model(LinearRegression(), X_subset_rfe_np, y_scaled, y_original, y_scaler, tscv)))
            else:
                 results['RFE'].append((n, np.nan, np.nan, np.nan))

        except Exception as e:
             print(f"RFE failed for n={n}: {e}")
             results['RFE'].append((n, np.nan, np.nan, np.nan))


        # Forward
        try:
            # SFS requires k_features <= n_features
            if n <= X_scaled_df.shape[1]:
                # Use X_scaled_df (DataFrame) for SFS fit
                sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n, direction='forward', cv=tscv, n_jobs=-1)
                 # Fit on X_scaled_df (DataFrame) and get transformed numpy array
                X_subset_sfs_np = sfs.fit_transform(X_scaled_df, y_scaled)
                # Pass numpy array to evaluate_model
                results['Forward'].append((n, *evaluate_model(LinearRegression(), X_subset_sfs_np, y_scaled, y_original, y_scaler, tscv)))
            else:
                results['Forward'].append((n, np.nan, np.nan, np.nan))
        except Exception as e:
            print(f"Forward Selection failed for n={n}: {e}")
            results['Forward'].append((n, np.nan, np.nan, np.nan))


        # RF Importance
        idx = np.argsort(importances)[-n:]
        # Select columns using index from X_scaled_df
        X_subset_rf = X_scaled_df.iloc[:, idx]
        # Pass DataFrame to evaluate_model
        results['RandomForest'].append((n, *evaluate_model(LinearRegression(), X_subset_rf, y_scaled, y_original, y_scaler, tscv)))

    # Build metrics DataFrame
    dfs = []
    for method, vals in results.items():
        df = pd.DataFrame(vals, columns=['n_features', f'{method}_RMSE', f'{method}_MAPE', f'{method}_R2'])
        dfs.append(df)

    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.merge(df, on='n_features', how='outer')

    return df_combined

# Assuming df_lagged is available and contains the data with lags
# Assuming target_cols is defined

target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
results = {}

for target in target_cols:
    # Ensure df_lagged is available and contains the target column
    if 'df_lagged' in locals() and target in df_lagged.columns:
        lag_cols = [f'{target}_lag1', f'{target}_lag2']
        # Ensure we only try to drop columns that exist in df_lagged
        cols_to_drop = [target] + [col for col in lag_cols if col in df_lagged.columns]

        # Select features for X - drop target(s) and their lags
        X = df_lagged.drop(columns=cols_to_drop)
        # Select the current target variable and drop NaNs
        y = df_lagged[target].dropna()

        # Align X with the cleaned y by index
        X = X.loc[y.index]


        # Ensure X is not empty after aligning with y
        if X.empty:
            print(f"No valid data points after dropping NaNs for target: {target}. Skipping evaluation.")
            results[target] = pd.DataFrame() # Store an empty DataFrame
            continue


        print(f"\n🔍 Evaluating for target: {target}")
        # Pass X as a DataFrame and y as a Series (without NaNs)
        df_metrics = find_best_features_with_metrics(X, y)
        results[target] = df_metrics

        # Plot metrics for the current target only if df_metrics is not empty
        if not df_metrics.empty:
             plot_metrics(df_metrics, target)
        else:
             print(f"No metrics to plot for target: {target}.")


    else:
        print(f"df_lagged or target column '{target}' not found. Skipping evaluation for this target.")
🔍 Evaluating for target: Life expectancy
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
🔍 Evaluating for target: Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.3344987559357833, tolerance: 1.2543280301043949
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.3041213861906726, tolerance: 1.2543280301043949
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.2939728821220342, tolerance: 1.2543280301043949
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.291560398247384, tolerance: 1.2543280301043949
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.2833807594306563, tolerance: 1.2543280301043949
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.2738431218404003, tolerance: 1.2543280301043949
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.263577498113591, tolerance: 1.2543280301043949
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.703431344112687, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6969510646304116, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6853641250145301, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6726185397310473, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6593221832208656, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6457220310112461, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.631997326414421, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.618292309503886, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_coordinate_descent.py:681: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.6047235924033885, tolerance: 1.5837940758962923
  model = cd_fast.enet_coordinate_descent_gram(
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
🔍 Evaluating for target: Diabetes
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Comparative Summary Table - Feature Selection with metrics (RMSE, MAPE, and R²)¶

In [ ]:
## The best Feature Selection with different metrics TABLE

# Install tabulate if needed
!pip install tabulate

from sklearn.linear_model import Ridge, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer # Import Imputer

# Main function to calculate metrics for a given set of features
def calculate_metrics_for_features(X_df, y, feature_indices):
    # Impute missing values in X_df
    imputer = SimpleImputer(strategy='mean')
    X_df_imputed = pd.DataFrame(imputer.fit_transform(X_df), columns=X_df.columns, index=X_df.index)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_df_imputed) # Use imputed data here
    y = y.values.reshape(-1, 1)
    y_scaler = StandardScaler().fit(y)
    y_scaled = y_scaler.transform(y).ravel()
    y_original = y.ravel()
    tscv = TimeSeriesSplit(n_splits=3)

    X_subset = X_scaled[:, feature_indices]

    y_preds, y_tests = [], []
    for train_idx, test_idx in tscv.split(X_subset):
        model = Ridge()
        model.fit(X_subset[train_idx], y_scaled[train_idx])
        pred = model.predict(X_subset[test_idx])
        y_pred = y_scaler.inverse_transform(pred.reshape(-1, 1)).ravel()
        y_preds.extend(y_pred)
        y_tests.extend(y_original[test_idx])

    return (
        np.sqrt(mean_squared_error(y_tests, y_preds)),
        mean_absolute_error(y_tests, y_preds),
        r2_score(y_tests, y_preds)
    )

# Extract Best Results per Method from the 'results' dictionary
def extract_best_per_method(results_dict, X_data_for_targets, y_data_for_targets):
    summary = []
    for target, target_results in results_dict.items():
        df_combined = target_results['df_combined']
        best_methods_info = target_results['best_methods']
        X_target = X_data_for_targets[target] # Get the correct X for this target
        y_target = y_data_for_targets[target] # Get the correct y for this target


        for method, info in best_methods_info.items():
            n_features = info['n_features']
            selected_feature_names = info['features']

            # Get the indices of the selected features from the X_target DataFrame columns
            try:
                # Ensure selected_feature_names are in the columns of X_target
                valid_selected_features = [col for col in selected_feature_names if col in X_target.columns]
                feature_indices = [X_target.columns.get_loc(col) for col in valid_selected_features]

            except KeyError as e:
                 print(f"Error: Feature '{e}' not found in original DataFrame columns for target {target}, method {method}. Skipping.")
                 continue # Skip this combination if features are not found


            if n_features > 0 and feature_indices:
                 # Calculate metrics using the selected features and the correct X_target and y_target
                 # Pass the subset of X_target using the valid_selected_features column names
                 rmse, mae, r2 = calculate_metrics_for_features(X_target[valid_selected_features], y_target, list(range(len(valid_selected_features)))) # Pass indices relative to the subset


                 summary.append({
                     'Target': target,
                     'Method': method,
                     'n_features': len(valid_selected_features), # Use the count of valid features
                     'RMSE': round(rmse, 2),
                     'MAE': round(mae, 2),
                     'R²': round(r2, 4)
                 })
            elif n_features == 0:
                 # Handle case with 0 features if necessary, although typically we select at least 1
                 summary.append({
                     'Target': target,
                     'Method': method,
                     'n_features': 0,
                     'RMSE': np.nan, # Or a baseline metric if applicable
                     'MAE': np.nan,
                     'R²': np.nan
                 })


    return pd.DataFrame(summary)

# Assuming df_lagged is available from previous steps
# Prepare the X and y dataframes for each target as they were used in the feature selection loop
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
X_data_for_targets = {}
y_data_for_targets = {}

if 'df_lagged' in locals():
    for target in target_cols:
        if target in df_lagged.columns:
            lag_cols = [f'{target}_lag1', f'{target}_lag2']
            cols_to_drop = [target] + [col for col in lag_cols if col in df_lagged.columns]
            X = df_lagged.drop(columns=cols_to_drop)
            y = df_lagged[target].dropna() # Use the y with NaNs dropped as in the previous cell

            # Align X with the cleaned y by index
            X = X.loc[y.index]

            X_data_for_targets[target] = X
            y_data_for_targets[target] = y
        else:
            print(f"Target column '{target}' not found in df_lagged. Cannot prepare data for this target.")

# Extract Best Results per Method
# Use the 'results' dictionary generated from the previous cell's execution and the prepared X and y data
if 'results' in locals() and results and X_data_for_targets and y_data_for_targets:
    best_performance_df = extract_best_per_method(results, X_data_for_targets, y_data_for_targets)

    # Print Final Table
    if not best_performance_df.empty:
        print("\nBest Performance per Method\n")
        print(tabulate(best_performance_df, headers='keys', tablefmt='fancy_grid', showindex=False))
    else:
        print("\nNo best performance results to display.")
else:
    print("\n'results' dictionary, X_data_for_targets, or y_data_for_targets not found or is empty. Please run the feature selection cell first and ensure data is prepared correctly.")

# export and download file
best_performance_df.to_csv("best_feature_selection_summary.csv", index=False)

from google.colab import files
files.download("best_feature_selection_summary.csv")
Requirement already satisfied: tabulate in /usr/local/lib/python3.11/dist-packages (0.9.0)

Best Performance per Method

╒═════════════════════════╤══════════════╤══════════════╤════════╤═══════╤═════════╕
│ Target                  │ Method       │   n_features │   RMSE │   MAE │      R² │
╞═════════════════════════╪══════════════╪══════════════╪════════╪═══════╪═════════╡
│ Cardiovascular diseases │ LASSO        │            3 │ 144.69 │ 37.6  │  0.0043 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Cardiovascular diseases │ RFE          │            3 │ 144.51 │ 38.52 │  0.0067 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Cardiovascular diseases │ Forward      │           11 │ 145.41 │ 37.95 │ -0.0056 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Cardiovascular diseases │ RandomForest │            3 │ 145.32 │ 37.89 │ -0.0044 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Diabetes                │ LASSO        │           11 │   3.64 │  2.6  │  0.4649 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Diabetes                │ RFE          │            9 │   3.64 │  2.6  │  0.4629 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Diabetes                │ Forward      │           17 │   3.63 │  2.58 │  0.4671 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Diabetes                │ RandomForest │           17 │   3.69 │  2.62 │  0.45   │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Life expectancy         │ LASSO        │            5 │   3.5  │  2.69 │  0.9133 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Life expectancy         │ RFE          │            7 │   3.5  │  2.68 │  0.9134 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Life expectancy         │ Forward      │           17 │   3.5  │  2.68 │  0.9136 │
├─────────────────────────┼──────────────┼──────────────┼────────┼───────┼─────────┤
│ Life expectancy         │ RandomForest │            9 │   3.51 │  2.68 │  0.9132 │
╘═════════════════════════╧══════════════╧══════════════╧════════╧═══════╧═════════╛

Base on the result of the table, the following Feature Selection method and number of features will be used in this study as follows:

  • Life Expectancy - Forward Selection - # of features = 17

  • Cardiovascular Diseases - RFE - # of features = 3

  • Diabetes - Forward Selection - # of features = 17

According to the result from Feature Selection Summary table to determine which feature selection method is the best for the specific targets.¶

For cardiovascular diseases, RFE was selected as the preferred method due to its slightly superior RMSE and R² scores

For diabetes and life expectancy, Forward Selection provided the best overall performance.

Feature Selection¶

RFE is used for Life Cardiovascular disease, and Forward Selection is used for Diabetes and Life Expectancy, according to the result of Feature Selection Summary Table

Target Variables: Life Expectancy, Diabetes and Cardiovascular disease

In [ ]:
# Forward Selection - Life Expectancy, Diabetes    XXXXXXXXX remove
# RFE - Cardiovascular disease

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
import numpy as np
import pandas as pd

# === Clean dataset
if 'lagged' in df_lagged.columns:
    df_lagged = df_lagged.drop(columns='lagged')

# === Target feature limits
forward_targets = {
    'Life expectancy': 17,
    'Diabetes': 17
}
rfe_target = 'Cardiovascular diseases'
rfe_num_features = 3

# === Exclude target-related columns
excluded_cols = [
    col for col in df_lagged.columns
    if col in list(forward_targets.keys()) + [rfe_target]
    or any(col.startswith(f"{t}_lag") for t in list(forward_targets.keys()) + [rfe_target])
]

# === Forward Selection Function
def forward_selection(df, target, max_features):
    print(f"\n🎯 Target: {target}")

    X_raw = df.drop(columns=[col for col in excluded_cols if col in df.columns]).copy()
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    df_temp = X_raw.copy()
    df_temp[target] = df[target]
    df_temp = df_temp.dropna()

    X_raw = df_temp.drop(columns=[target])
    y = df_temp[target]

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)

    remaining = list(X_scaled.columns)
    selected = []

    for _ in range(max_features):
        rmse_scores = {}
        for f in remaining:
            trial = selected + [f]
            model = LinearRegression()
            score = cross_val_score(model, X_scaled[trial], y,
                                    scoring='neg_mean_squared_error', cv=5)
            rmse = np.mean(np.sqrt(-score))
            rmse_scores[f] = rmse
        best_feature = min(rmse_scores, key=rmse_scores.get)
        selected.append(best_feature)
        remaining.remove(best_feature)

    final_model = LinearRegression()
    final_rmse = np.mean(np.sqrt(-cross_val_score(final_model, X_scaled[selected], y,
                                                  scoring='neg_mean_squared_error', cv=5)))
    print(f"✅ Selected ({max_features}) for {target}: {selected}")
    print(f"📉 Final CV RMSE: {final_rmse:.4f}")

# === RFE Function
def rfe_selection(df, target, num_features):
    print(f"\n🫀 Target: {target} (RFE with {num_features} features)")

    X_raw = df.drop(columns=[col for col in excluded_cols if col in df.columns]).copy()
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    df_temp = X_raw.copy()
    df_temp[target] = df[target]
    df_temp = df_temp.dropna()

    X_raw = df_temp.drop(columns=[target])
    y = df_temp[target]

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_raw), columns=X_raw.columns)

    model = LinearRegression()
    selector = RFE(model, n_features_to_select=num_features)
    selector = selector.fit(X_scaled, y)

    selected = list(X_scaled.columns[selector.support_])
    final_rmse = np.mean(np.sqrt(-cross_val_score(model, X_scaled[selected], y,
                                                  scoring='neg_mean_squared_error', cv=5)))
    print(f"🔍 RFE Selected for {target}: {selected}")
    print(f"📉 Final CV RMSE: {final_rmse:.4f}")

# === Run selections
for target, limit in forward_targets.items():
    forward_selection(df_lagged, target, limit)

rfe_selection(df_lagged, rfe_target, rfe_num_features)
🎯 Target: Life expectancy
✅ Selected (17) for Life expectancy: ['Child mortality rate', 'Child mortality rate_lag3', 'GDP', 'Sex ratio_lag2', 'Child mortality rate_lag2', 'Sex ratio', 'CPI', 'CPI_lag3', 'CPI_lag1', 'Sex ratio_lag3', 'CPI_lag2', 'Sex ratio_lag1', 'GDP_lag1', 'GDP_lag2', 'GDP_lag3', 'Child mortality rate_lag1', 'Median age_lag3']
📉 Final CV RMSE: 3.3999

🎯 Target: Diabetes
✅ Selected (17) for Diabetes: ['BMI_avg_lag3', 'Income', 'GDP', 'Median age_lag3', 'CPI_lag3', 'Sex ratio_lag3', 'Cost of a healthy diet', 'Gini coefficient', 'GDP_lag3', 'Median age_lag2', 'Sex ratio', 'Income_lag1', 'Sex ratio_lag1', 'GDP_lag1', 'GDP_lag2', 'Sex ratio_lag2', 'Income_lag2']
📉 Final CV RMSE: 3.2859

🫀 Target: Cardiovascular diseases (RFE with 3 features)
🔍 RFE Selected for Cardiovascular diseases: ['Income', 'GDP', 'BMI_avg_lag3']
📉 Final CV RMSE: 116.1274
In [ ]:
# Forward Selection - Life Expectancy, Diabetes
# RFE - Cardiovascular disease -
# REVISED - feature selection after train test set

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import RFE
import numpy as np
import pandas as pd

# === Clean dataset ===
if 'lagged' in df_lagged.columns:
    df_lagged = df_lagged.drop(columns='lagged')

# === Target feature limits ===
forward_targets = {
    'Life expectancy': 17,
    'Diabetes': 17
}
rfe_target = 'Cardiovascular diseases'
rfe_num_features = 3

# === Exclude target-related columns ===
excluded_cols = [
    col for col in df_lagged.columns
    if col in list(forward_targets.keys()) + [rfe_target]
    or any(col.startswith(f"{t}_lag") for t in list(forward_targets.keys()) + [rfe_target])
]

# === Forward Selection Function with train-test split ===
def forward_selection(df, target, max_features, test_size=0.2):
    print(f"\n🎯 Target: {target}")

    # Prepare data
    X_raw = df.drop(columns=[col for col in excluded_cols if col in df.columns]).copy()
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    df_temp = X_raw.copy()
    df_temp[target] = df[target]
    df_temp = df_temp.dropna()

    X_full = df_temp.drop(columns=[target])
    y_full = df_temp[target]

    # Train-test split (no shuffle for time series)
    X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=test_size, shuffle=False)

    # Scale features separately on train and test
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

    remaining = list(X_train_scaled.columns)
    selected = []

    for _ in range(min(max_features, len(remaining))):
        rmse_scores = {}
        for f in remaining:
            trial = selected + [f]
            model = LinearRegression()
            # Cross-validation only on training set
            score = cross_val_score(model, X_train_scaled[trial], y_train,
                                    scoring='neg_mean_squared_error', cv=5)
            rmse = np.mean(np.sqrt(-score))
            rmse_scores[f] = rmse
        best_feature = min(rmse_scores, key=rmse_scores.get)
        selected.append(best_feature)
        remaining.remove(best_feature)

    final_model = LinearRegression()
    final_rmse_train = np.mean(np.sqrt(-cross_val_score(final_model, X_train_scaled[selected], y_train,
                                                        scoring='neg_mean_squared_error', cv=5)))
    final_model.fit(X_train_scaled[selected], y_train)
    test_preds = final_model.predict(X_test_scaled[selected])
    final_rmse_test = np.sqrt(np.mean((y_test - test_preds) ** 2))

    print(f"✅ Selected ({max_features}) features for {target}: {selected}")
    print(f"📉 Final CV RMSE (train): {final_rmse_train:.4f}")
    print(f"📊 RMSE on test set: {final_rmse_test:.4f}")

# === RFE Function with train-test split ===
def rfe_selection(df, target, num_features, test_size=0.2):
    print(f"\n🫀 Target: {target} (RFE with {num_features} features)")

    X_raw = df.drop(columns=[col for col in excluded_cols if col in df.columns]).copy()
    if target not in df.columns:
        print(f"⚠️ '{target}' not found in columns.")
        return

    df_temp = X_raw.copy()
    df_temp[target] = df[target]
    df_temp = df_temp.dropna()

    X_full = df_temp.drop(columns=[target])
    y_full = df_temp[target]

    # Train-test split (no shuffle for time series)
    X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=test_size, shuffle=False)

    # Scale features separately
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

    model = LinearRegression()
    selector = RFE(model, n_features_to_select=num_features)
    selector.fit(X_train_scaled, y_train)

    selected = list(X_train_scaled.columns[selector.support_])

    final_rmse_train = np.mean(np.sqrt(-cross_val_score(model, X_train_scaled[selected], y_train,
                                                        scoring='neg_mean_squared_error', cv=5)))
    model.fit(X_train_scaled[selected], y_train)
    test_preds = model.predict(X_test_scaled[selected])
    final_rmse_test = np.sqrt(np.mean((y_test - test_preds) ** 2))

    print(f"🔍 RFE Selected features for {target}: {selected}")
    print(f"📉 Final CV RMSE (train): {final_rmse_train:.4f}")
    print(f"📊 RMSE on test set: {final_rmse_test:.4f}")

# === Run selections ===
for target, limit in forward_targets.items():
    forward_selection(df_lagged, target, limit)

rfe_selection(df_lagged, rfe_target, rfe_num_features)
🎯 Target: Life expectancy
✅ Selected (17) features for Life expectancy: ['Child mortality rate', 'Child mortality rate_lag3', 'GDP', 'Sex ratio_lag1', 'Unemployment Rate_lag1', 'Cost of a healthy diet', 'Child mortality rate_lag2', 'Sex ratio', 'Sex ratio_lag2', 'Unemployment Rate', 'Unemployment Rate_lag3', 'Unemployment Rate_lag2', 'Sex ratio_lag3', 'Cost of a healthy diet_lag1', 'Cost of a healthy diet_lag2', 'Cost of a healthy diet_lag3', 'GDP_lag1']
📉 Final CV RMSE (train): 3.3434
📊 RMSE on test set: 3.7352

🎯 Target: Diabetes
✅ Selected (17) features for Diabetes: ['BMI_avg_lag3', 'Income', 'GDP_lag3', 'Median age_lag3', 'Sex ratio_lag3', 'Sex ratio', 'Income_lag3', 'Income_lag2', 'Income_lag1', 'Sex ratio_lag2', 'Sex ratio_lag1', 'GDP_lag2', 'GDP_lag1', 'GDP', 'BMI_avg_lag2', 'BMI_avg_lag1', 'BMI_avg']
📉 Final CV RMSE (train): 3.2852
📊 RMSE on test set: 3.3866

🫀 Target: Cardiovascular diseases (RFE with 3 features)
🔍 RFE Selected features for Cardiovascular diseases: ['Child mortality rate', 'Incomplete tertiary education', 'Child mortality rate_lag3']
📉 Final CV RMSE (train): 73.6098
📊 RMSE on test set: 266.1863

The features have been selected for target variables as follows:

  • Life expectancy: ['Child mortality rate', 'Child mortality rate_lag3', 'GDP', 'Sex ratio_lag1', 'Unemployment Rate_lag1', 'Cost of a healthy diet', 'Child mortality rate_lag2', 'Sex ratio', 'Sex ratio_lag2', 'Unemployment Rate', 'Unemployment Rate_lag3', 'Unemployment Rate_lag2', 'Sex ratio_lag3', 'Cost of a healthy diet_lag1', 'Cost of a healthy diet_lag2', 'Cost of a healthy diet_lag3', 'GDP_lag1']
  • Diabete: ['BMI_avg_lag3', 'Income', 'GDP_lag3', 'Median age_lag3', 'Sex ratio_lag3', 'Sex ratio', 'Income_lag3', 'Income_lag2', 'Income_lag1', 'Sex ratio_lag2', 'Sex ratio_lag1', 'GDP_lag2', 'GDP_lag1', 'GDP', 'BMI_avg_lag2', 'BMI_avg_lag1', 'BMI_avg']
  • Cardiovascular disesase: ['Child mortality rate', 'Incomplete tertiary education', 'Child mortality rate_lag3']

Feature Importance¶

Feature importance quantifies how useful or valuable each feature (independent variable) is in predicting the target variable in a model.

Feature importance refers to a technique used to quantify how much each independent variable contributes to predicting the target variable in a machine learning model.

In this project analyzing global health and economic indicators to forecast outcomes like life expectancy, cardiovascular disease rates, and diabetes—using feature importance enhances interpretability by identifying which variables are most influential in driving a meaningful predictions, which can be valuable for policy recommendations or academic insights. Based on the feature selection process using Forward Selection and RFE (Table 1), distinct sets of predictors were identified for each health outcome:

For life expectancy, 16 key predictors were selected using Forward Selection, with a strong emphasis on child mortality rate and its lagged values (e.g., lag1, lag2, lag3), as well as GDP and its historical trends (lags 1–3). These results highlight the long-term influence of both early-life health indicators and macroeconomic development on longevity. Additionally, several lagged versions of the Consumer Price Index (CPI) and sex ratio were selected, indicating the importance of economic stability and population structure in shaping life expectancy over time.

For diabetes prevalence, the selected predictors (also from Forward Selection) predominantly include lifestyle and economic variables, such as BMI (lagged), income, GDP, and cost of a healthy diet. The presence of lagged features for median age, CPI, and sex ratio suggests that both aging demographics and economic accessibility to health-promoting resources (e.g., food affordability) play a substantial role in diabetes outcomes. This aligns closely with Research Question 1 and 2, emphasizing how modifiable factors — particularly income, BMI, and economic indicators — influence chronic disease prevalence.

For cardiovascular disease, the top three features identified via Recursive Feature Elimination (RFE) were income, GDP, and BMI (lagged). These results reinforce the hypothesis that economic capacity and lifestyle-related health behaviors (e.g., body weight management) are central to cardiovascular risk. Notably, this minimal yet effective feature set highlights that a few strong predictors can explain a significant portion of variation in cardiovascular outcomes.

Collectively, these findings support Research Question 1, identifying socioeconomic and lifestyle variables that most strongly influence disease prevalence. They also inform Research Question 2, by demonstrating that modifiable economic and lifestyle factors (e.g., income, BMI, CPI, diet cost) are central to variations in life expectancy and non-communicable disease rates across countries from 1950 to 2023.

In [ ]:
# Feature Importance Table - REVISED

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# === Setup
targets = ['Cardiovascular diseases', 'Diabetes', 'Life expectancy']
methods = ['Forward', 'RFE', 'LASSO', 'Random Forest']
all_features = [
    'Child mortality rate', 'Child mortality rate_lag3', 'GDP', 'Sex ratio_lag1', 'Unemployment Rate_lag1',
    'Cost of a healthy diet', 'Child mortality rate_lag2', 'Sex ratio', 'Sex ratio_lag2', 'Unemployment Rate',
    'Unemployment Rate_lag3', 'Unemployment Rate_lag2', 'Sex ratio_lag3', 'Cost of a healthy diet_lag1',
    'Cost of a healthy diet_lag2', 'Cost of a healthy diet_lag3', 'GDP_lag1','BMI_avg_lag3', 'Income',
    'GDP_lag3', 'Median age_lag3', 'Income_lag3', 'Income_lag2', 'Income_lag1', 'GDP_lag2', 'BMI_avg_lag2',
    'BMI_avg_lag1', 'BMI_avg','Incomplete tertiary education'
]

# === Initialize importance table
multi_method_importance = pd.DataFrame(index=all_features,
                                       columns=pd.MultiIndex.from_product([targets, methods]),
                                       dtype=float).fillna(0.0)

# === Function: Standardization and prep
def prepare_data(target, features):
    valid_features = [f for f in features if f in df_lagged.columns]
    df_temp = df_lagged[valid_features + [target]].dropna()
    X = df_temp[valid_features]
    y = df_temp[target]
    X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    return X_scaled, y, valid_features

# === FORWARD SELECTION
def run_forward(X, y, valid_features, max_features):
    selected = []
    remaining = valid_features.copy()
    for _ in range(min(max_features, len(remaining))):
        scores = {}
        for f in remaining:
            trial = selected + [f]
            model = LinearRegression()
            neg_mse = cross_val_score(model, X[trial], y,
                                      scoring='neg_mean_squared_error', cv=5)
            rmse = np.mean(np.sqrt(-neg_mse))
            scores[f] = rmse
        best_feature = min(scores, key=scores.get)
        selected.append(best_feature)
        remaining.remove(best_feature)
    for f in selected:
        multi_method_importance.loc[f, (target, 'Forward')] = 1

# === RFE
def run_rfe(X, y, valid_features, num_features):
    model = LinearRegression()
    selector = RFE(model, n_features_to_select=num_features)
    selector = selector.fit(X, y)
    for f, support in zip(valid_features, selector.support_):
        if support:
            multi_method_importance.loc[f, (target, 'RFE')] = 1

# === LASSO
def run_lasso(X, y):
    model = LassoCV(cv=5, random_state=42)
    model.fit(X, y)
    for i, f in enumerate(X.columns):
        multi_method_importance.loc[f, (target, 'LASSO')] = round(abs(model.coef_[i]), 4)

# === RANDOM FOREST
def run_rf(X, y):
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)
    importances = rf.feature_importances_
    for i, f in enumerate(X.columns):
        multi_method_importance.loc[f, (target, 'Random Forest')] = round(importances[i], 4)

# === Run loop for all targets
for target in targets:
    if target not in df_lagged.columns:
        print(f"⚠️ Skipping {target} — not found in dataset.")
        continue

    X_scaled, y, valid = prepare_data(target, all_features)

    run_forward(X_scaled, y, valid, max_features=17 if target != 'Cardiovascular diseases' else 3)
    run_rfe(X_scaled, y, valid, num_features=3)
    run_lasso(X_scaled, y)
    run_rf(X_scaled, y)

# === Display styled table
styled_multi_table = multi_method_importance.style \
    .set_caption("📊 Multi-Method Feature Importance Table") \
    .format(precision=4) \
    .set_table_styles([
        {'selector': 'table', 'props': [('border-collapse', 'collapse'),
                                        ('border', '1px solid black')]},
        {'selector': 'th, td', 'props': [('border', '1px solid black'), ('padding', '4px')]}
    ])

# Display multi table

# Summary table combining the feature importance score with descending order for each target
for target in targets:
    col_forward = (target, 'Forward')
    col_rfe = (target, 'RFE')
    col_lasso = (target, 'LASSO')
    col_rf = (target, 'Random Forest')

    combined_name = (target, 'Combined')
    multi_method_importance[combined_name] = (
        multi_method_importance[col_forward].fillna(0) +
        multi_method_importance[col_rfe].fillna(0) +
        multi_method_importance[col_lasso].fillna(0) +
        multi_method_importance[col_rf].fillna(0)
    )

# === Reorder for display: sort by combined score for each target
for target in targets:
    sort_col = (target, 'Combined')
    sorted_features = multi_method_importance.sort_values(by=sort_col, ascending=False).index
    multi_method_importance = multi_method_importance.loc[sorted_features]

# === Display enhanced table
styled_combined_table = multi_method_importance.style \
    .set_caption("⭐ Enhanced Feature Importance Comparison (4 Methods + Combined)") \
    .format(precision=4) \
    .set_table_styles([
        {'selector': 'table', 'props': [('border-collapse', 'collapse'), ('border', '1px solid black')]},
        {'selector': 'th, td', 'props': [('border', '1px solid black'), ('padding', '5px')]}
    ])

display(styled_combined_table)

# Save the file
multi_method_importance.to_csv("enhanced_feature_importance_comparison.csv", index=True)

# Download the file (include full path)
from google.colab import files
files.download('enhanced_feature_importance_comparison.csv')
⭐ Enhanced Feature Importance Comparison (4 Methods + Combined)
  Cardiovascular diseases Diabetes Life expectancy Cardiovascular diseases Diabetes Life expectancy
  Forward RFE LASSO Random Forest Forward RFE LASSO Random Forest Forward RFE LASSO Random Forest Combined Combined Combined
Child mortality rate 0.0000 0.0000 0.0000 0.0031 0.0000 0.0000 0.0000 0.0083 1.0000 1.0000 18.7165 0.9229 0.0031 0.0083 21.6394
Child mortality rate_lag3 0.0000 0.0000 0.0000 0.0018 0.0000 0.0000 0.1466 0.0060 1.0000 1.0000 7.5788 0.0210 0.0018 0.1526 9.5998
Child mortality rate_lag2 0.0000 0.0000 0.0000 0.0027 0.0000 0.0000 0.0000 0.0054 1.0000 1.0000 0.0000 0.0053 0.0027 0.0054 2.0053
Income_lag3 0.0000 0.0000 0.0000 0.0666 1.0000 0.0000 0.3318 0.0109 1.0000 0.0000 0.3482 0.0020 0.0666 1.3427 1.3502
GDP 0.0000 1.0000 0.0000 0.0878 1.0000 1.0000 0.3975 0.0254 1.0000 0.0000 0.3190 0.0021 1.0878 2.4229 1.3211
BMI_avg 1.0000 0.0000 0.0000 0.0062 0.0000 0.0000 0.0000 0.0106 1.0000 0.0000 0.1729 0.0026 1.0062 0.0106 1.1755
Sex ratio 0.0000 0.0000 0.0000 0.0076 1.0000 0.0000 0.0923 0.0054 1.0000 0.0000 0.1535 0.0024 0.0076 1.0977 1.1559
Median age_lag3 0.0000 0.0000 0.0000 0.0060 1.0000 0.0000 0.3257 0.0281 1.0000 0.0000 0.1433 0.0073 0.0060 1.3538 1.1506
Sex ratio_lag3 0.0000 0.0000 0.0000 0.0262 1.0000 0.0000 0.1618 0.0071 1.0000 0.0000 0.0936 0.0023 0.0262 1.1689 1.0959
Sex ratio_lag2 0.0000 0.0000 0.0000 0.0108 1.0000 0.0000 0.0150 0.0048 1.0000 0.0000 0.0793 0.0019 0.0108 1.0198 1.0812
Sex ratio_lag1 0.0000 0.0000 0.0000 0.0095 1.0000 0.0000 0.0165 0.0045 1.0000 0.0000 0.0246 0.0016 0.0095 1.0210 1.0262
BMI_avg_lag3 1.0000 1.0000 0.0000 0.0558 1.0000 1.0000 3.3170 0.5842 1.0000 0.0000 0.0000 0.0018 2.0558 5.9012 1.0018
GDP_lag3 0.0000 0.0000 0.0000 0.1160 1.0000 0.0000 0.2146 0.0263 1.0000 0.0000 0.0000 0.0016 0.1160 1.2409 1.0016
GDP_lag1 0.0000 0.0000 0.0000 0.0272 1.0000 0.0000 0.0000 0.0150 1.0000 0.0000 0.0000 0.0013 0.0272 1.0150 1.0013
BMI_avg_lag1 1.0000 0.0000 0.0000 0.0098 0.0000 0.0000 0.0000 0.0083 1.0000 0.0000 0.0000 0.0013 1.0098 0.0083 1.0013
GDP_lag2 0.0000 0.0000 0.0000 0.0206 1.0000 0.0000 0.0000 0.0172 1.0000 0.0000 0.0000 0.0012 0.0206 1.0172 1.0012
BMI_avg_lag2 0.0000 0.0000 0.0000 0.0165 0.0000 0.0000 0.0000 0.0103 1.0000 0.0000 0.0000 0.0011 0.0165 0.0103 1.0011
Income 0.0000 1.0000 0.0000 0.0522 1.0000 1.0000 0.5190 0.0574 0.0000 0.0000 0.3121 0.0025 1.0522 2.5764 0.3146
Unemployment Rate 0.0000 0.0000 0.0000 0.0105 0.0000 0.0000 0.0432 0.0156 0.0000 0.0000 0.0668 0.0025 0.0105 0.0588 0.0693
Cost of a healthy diet 0.0000 0.0000 0.0000 0.0170 1.0000 0.0000 0.3548 0.0223 0.0000 0.0000 0.0505 0.0013 0.0170 1.3771 0.0518
Incomplete tertiary education 0.0000 0.0000 0.0000 0.2616 0.0000 0.0000 0.3349 0.0377 0.0000 0.0000 0.0203 0.0043 0.2616 0.3726 0.0246
Unemployment Rate_lag3 0.0000 0.0000 0.0000 0.0230 0.0000 0.0000 0.0348 0.0190 0.0000 0.0000 0.0000 0.0014 0.0230 0.0538 0.0014
Unemployment Rate_lag1 0.0000 0.0000 0.0000 0.0099 0.0000 0.0000 0.0002 0.0112 0.0000 0.0000 0.0000 0.0013 0.0099 0.0114 0.0013
Cost of a healthy diet_lag3 0.0000 0.0000 0.0000 0.0134 0.0000 0.0000 0.0383 0.0104 0.0000 0.0000 0.0000 0.0013 0.0134 0.0487 0.0013
Income_lag2 0.0000 0.0000 0.0000 0.0620 1.0000 0.0000 0.0522 0.0116 0.0000 0.0000 0.0000 0.0012 0.0620 1.0638 0.0012
Cost of a healthy diet_lag2 0.0000 0.0000 0.0000 0.0174 1.0000 0.0000 0.0367 0.0054 0.0000 0.0000 0.0000 0.0011 0.0174 1.0421 0.0011
Income_lag1 0.0000 0.0000 0.0000 0.0371 1.0000 0.0000 0.0774 0.0147 0.0000 0.0000 0.0000 0.0011 0.0371 1.0921 0.0011
Unemployment Rate_lag2 0.0000 0.0000 0.0000 0.0124 0.0000 0.0000 0.0525 0.0104 0.0000 0.0000 0.0000 0.0011 0.0124 0.0629 0.0011
Cost of a healthy diet_lag1 0.0000 0.0000 0.0000 0.0094 1.0000 0.0000 0.0000 0.0066 0.0000 0.0000 0.0000 0.0011 0.0094 1.0066 0.0011

Refer to the above table - The feature importance analysis across multiple selection methods—Forward Selection, Recursive Feature Elimination (RFE), LASSO, and Random Forest—reveals distinct and insightful patterns in how different variables relate to three key health outcomes: life expectancy, diabetes, and cardiovascular diseases (CVD). For life expectancy, the most dominant predictor is child mortality rate, including its lagged versions. This finding underscores a strong inverse relationship between child mortality and life expectancy, highlighting the long-term benefits of improving early childhood health. Other notable features include lagged BMI averages and socioeconomic indicators such as income and sex ratio, suggesting that both historical health trends and broader demographic factors influence longevity.

In the case of diabetes, the most influential feature is BMI_avg_lag3, indicating that higher BMI levels from three years prior are a strong predictor of diabetes prevalence. This reflects the chronic and gradual development of diabetes linked to long-term obesity. Socioeconomic factors like income, the cost of a healthy diet, and lagged income variables also emerge as important predictors, suggesting that financial access to healthy food and lifestyle conditions are significant contributors. Additionally, lagged sex ratio and income highlight the delayed impact of gender distribution and earnings on diabetes rates.

For cardiovascular diseases, BMI_avg_lag3 again stands out as a major factor, pointing to obesity’s long-term role in heart-related conditions. Uniquely, education-related features, such as incomplete tertiary education, show relevance only for CVD, indicating that educational attainment may influence cardiovascular health through awareness, healthcare access, or lifestyle choices. Economic indicators like GDP and income also contribute but to a lesser extent than for diabetes or life expectancy.

Across all targets, lagged variables consistently outperform current-year features, emphasizing the delayed effects of socioeconomic and health conditions on public health outcomes. For instance, lagged income, BMI, and child mortality often provide stronger predictive power than their contemporaneous counterparts. This suggests that interventions in health or economic policy may take several years to manifest in population health metrics, reinforcing the need for long-term planning in public health strategies.

The results also align well with potential research questions. Firstly, key predictors for each outcome were identified, showing that health outcomes are driven by a mix of socio-economic, demographic, and lagged health indicators. Secondly, the importance of lagged features strongly supports the hypothesis that delayed effects exist and can be captured through temporal modeling. Lastly, certain predictors such as BMI_avg_lag3, income, and child mortality prove robust across multiple selection methods, confirming their consistent relevance.

In summary, this feature importance analysis not only highlights the leading drivers of life expectancy, diabetes, and cardiovascular diseases but also reveals the critical role of historical data in shaping current health outcomes. These insights provide valuable guidance for public health planning, suggesting that investments in early-life health, economic accessibility, and education can yield significant long-term benefits across diverse health indicators.

Feature Importance Plot¶

In [ ]:
# Bar Plot top features per target based on combined score

# === Custom color map
custom_colors = {
    'Cardiovascular diseases': 'mediumseagreen',
    'Diabetes': 'darkorange',
    'Life expectancy': 'cornflowerblue'
}
targets = ['Cardiovascular diseases', 'Diabetes', 'Life expectancy']
# === Plot top features per target with color
top_n = 10  # adjust as needed
for target in targets:
    combined_col = (target, 'Combined')
    df_top = multi_method_importance[[combined_col]].copy()
    df_top.columns = ['Combined Score']
    df_top = df_top.sort_values(by='Combined Score', ascending=False).head(top_n)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Combined Score', y=df_top.index, data=df_top, color=custom_colors[target])
    plt.title(f"Top {top_n} Features for {target} (Combined Score)", fontsize=14)
    plt.xlabel("Combined Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Result of Feature Importance Score - duplicate - combined with the previous analysis¶

The combined feature importance analysis reveals distinct patterns for each health-related outcome in your dataset. For cardiovascular diseases, the most influential features appear to be BMI-related metrics, income and its lagged variations, child mortality rate, and inflation. These variables suggest that lifestyle indicators and economic stressors significantly affect cardiovascular health, with recurring trends in lagged financial indicators highlighting their cumulative impact over time.

In the case of diabetes, the leading predictors include BMI averages, income trends, age-related metrics, and sex ratio variables. These results align with clinical understandings of diabetes as a condition deeply tied to aging populations, metabolic health, and economic access to preventive care and treatment. The combined scores spotlight chronic health trends and demographic shifts as primary drivers in diabetes prevalence.

For life expectancy, broader systemic features dominate the importance rankings. Child mortality rate is the most prominent indicator, followed closely by GDP and education metrics such as incomplete tertiary education and its lagged versions. Sex ratio and Gini coefficient also show up as key contributors, reflecting the impact of demographic balance and societal inequality on long-term health outcomes. Altogether, the model highlights that life expectancy is shaped not only by individual well-being, but also by a nation's structural capacity to support its population.

Across all three targets, features such as income, education, and demographic indicators show consistent influence, affirming their central role in determining population health. This integrated approach provides a reliable foundation for future modeling, health policy evaluation, and intervention planning—rooted in both statistical rigor and public health relevance.

Stationary Check - Augmented Dickey-Fuller (ADF)¶

ADF test is used to check whether a time series is stationary, meaning that its statistical properties such as mean and variance do not change over time. Many time series models, especially ARIMA, require stationary input data. The ADF test does this by testing for the presence of a unit root. If the test returns a p-value less than 0.05, it suggests that the data is stationary and does not have a unit root, which is a favorable condition for modeling. If the p-value is higher, it indicates non-stationarity, and you may need to transform the series (e.g., using differencing) before modeling.

In this project, residual diagnostics ensure that the regression assumptions are met, increasing the reliability and interpretability of the models. The ADF test guides your decision on whether time series transformations like differencing are needed before applying models such as ARIMA. Together, these steps strengthen your modeling pipeline by validating model assumptions and ensuring the forecasts are based on appropriate statistical foundations.

In [ ]:
# ADF

from statsmodels.tsa.stattools import adfuller

def adf_stationarity_check(series):
    adf_result = adfuller(series.dropna())
    return {
        "ADF Statistic": adf_result[0],
        "p-value": adf_result[1],
        "Critical Values": adf_result[4]
    }

target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

for target in target_cols:
    # Define y for the current target
    y_train = df_lagged[target]

    # Exclude only the current target from predictors
    X_train = df_lagged.drop(columns=target_cols)

    print(f"\n=== Diagnostics for: {target} ===")
    print(residual_diagnostics(X_train, y_train))  # Ensure this function is defined elsewhere
    print(adf_stationarity_check(y_train))
=== Diagnostics for: Life expectancy ===
({'LM stat': np.float64(1054.960432277746), 'BP p-value': np.float64(1.3671726945811232e-189), 'BP f-value': np.float64(23.371228853758538), 'BP f p-value': np.float64(2.265409568661906e-196)}, np.float64(0.0), Country      Year
Afghanistan  1953   -13.643177
             1954   -13.466028
             1955   -12.853535
             1956   -12.403676
             1957   -11.915237
                       ...    
Zimbabwe     2019    -2.418058
             2020    -2.201037
             2021    -3.600304
             2022    -1.790547
             2023    -2.078995
Length: 16928, dtype: float64)
{'ADF Statistic': np.float64(-18.53700721188469), 'p-value': np.float64(2.0998208728125792e-30), 'Critical Values': {'1%': np.float64(-3.4307207835986477), '5%': np.float64(-2.861703872110946), '10%': np.float64(-2.566857224856817)}}

=== Diagnostics for: Cardiovascular diseases ===
({'LM stat': np.float64(423.07071022958837), 'BP p-value': np.float64(1.788087659945201e-61), 'BP f-value': np.float64(9.01373461501685), 'BP f p-value': np.float64(2.1790146707553366e-62)}, np.float64(0.0), Country      Year
Afghanistan  1953   -49.297596
             1954   -51.587511
             1955   -51.100566
             1956   -51.349164
             1957   -51.703506
                       ...    
Zimbabwe     2019   -24.045878
             2020   -27.510980
             2021   -29.353246
             2022   -33.764736
             2023   -35.635556
Length: 16928, dtype: float64)
{'ADF Statistic': np.float64(-12.625040454273664), 'p-value': np.float64(1.5482585428290764e-23), 'Critical Values': {'1%': np.float64(-3.4307211202837773), '5%': np.float64(-2.8617040209035434), '10%': np.float64(-2.566857304056856)}}

=== Diagnostics for: Diabetes ===
({'LM stat': np.float64(1517.6712938492537), 'BP p-value': np.float64(1.9324915264761944e-286), 'BP f-value': np.float64(34.63149923847436), 'BP f p-value': np.float64(3.543825588324014e-301)}, np.float64(0.0), Country      Year
Afghanistan  1953    1.091656
             1954    0.994553
             1955    1.014652
             1956    0.996681
             1957    0.985414
                       ...   
Zimbabwe     2019   -2.276379
             2020   -2.298225
             2021   -1.978299
             2022   -1.970616
             2023   -1.974694
Length: 16928, dtype: float64)
{'ADF Statistic': np.float64(-12.317202881650106), 'p-value': np.float64(6.878896091647604e-23), 'Critical Values': {'1%': np.float64(-3.4307209097839584), '5%': np.float64(-2.8617039278765235), '10%': np.float64(-2.566857254539986)}}

The Augmented Dickey-Fuller (ADF) test results provide critical insight into the time series characteristics of the three key health outcomes in this study: life expectancy, diabetes, and cardiovascular diseases. All three variables demonstrate strong stationarity, as indicated by highly negative ADF statistics (e.g., -18.54 for life expectancy, -12.63 for cardiovascular diseases, and -12.32 for diabetes) and extremely low p-values (all near zero). These values are well below conventional significance thresholds (0.01 or 0.05), confirming that the time series are stationary—that is, their statistical properties such as mean and variance remain stable over time.rmance over time.

ACF and PACF plot¶

ACF (Autocorrelation Function) and PACF (Partial Autocorrelation Function) plots are visual tools used to analyze the correlation structure of time series data. They help identify patterns and dependencies between data points at different lags (time intervals) and are crucial for determining appropriate models for time series forecasting, particularly AR (Autoregressive) and MA (Moving Average) models.

In [ ]:
# ACF and PACF plot

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Assuming df_lagged is your dataset and contains time-series data
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

for target in target_cols:
    series = df_lagged[target].dropna()

    fig, ax = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    fig.suptitle(f'ACF and PACF for {target}', fontsize=16)

    plot_acf(series, lags=40, ax=ax[0])
    ax[0].set_title(f'Autocorrelation (ACF) - {target}')
    ax[0].set_ylabel('ACF')

    plot_pacf(series, lags=40, ax=ax[1], method='ywm')
    ax[1].set_title(f'Partial Autocorrelation (PACF) - {target}')
    ax[1].set_ylabel('PACF')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Based on the plot of ACF and PACF, for all targets on both the Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots show significant spikes that gradually decay over time, it strongly suggests that the series contains autocorrelation — meaning past values have a measurable influence on future ones. This is particularly common in time-series data with memory or lag effects. The ACF’s slow decay pattern indicates that a moving average (MA) component may be present, while the PACF plot with very strong spikes at the first two lags points to a likely autoregressive structure of order two, also known as AR(2). In this case, the series is heavily influenced by its own values from one and two time steps prior. Together, these patterns imply that an ARIMA model would be a suitable fit, specifically one with parameters ARIMA(2, 0, q), where "p = 2" captures the autoregressive lags, "d = 0" reflects the fact that the series is stationary (as shown in the ADF test), and "q" is chosen based on how far the ACF continues to show significant autocorrelation. These insights are instrumental in designing lag-based features or selecting model architectures that are sensitive to temporal dynamics, such as ARIMA, SARIMA, or even recurrent neural networks.

In this forecasting project, three lags (lag1, lag2, and lag3) were chosen for time-dependent predictors based on both statistical diagnostics and inference validation. The Partial Autocorrelation Function (PACF) plots consistently showed strong spikes at lag 1 and lag 2, with a noticeable flattening from lag 3 onward. This suggested an autoregressive structure primarily governed by the first two time steps. However, subsequent HAC-corrected regression analysis revealed that certain lag3 features, including economic and health indicators, eg. CPI, BMI, Inflation and Income etc. were still statistically significant (p < 0.05), confirming their meaningful contribution despite weaker autocorrelation beyond lag 2. By including lag3 in the modeling framework alongside lag1 and lag2, the models captured short-term memory effects while allowing for delayed impacts that are often present in real-world socioeconomic dynamics. This decision ensures a balance between temporal relevance and statistical validity, strengthening both the explanatory power and forecasting accuracy of the models.

Residual diagnostics (heteroscedasticity, autocorrelation)¶

Residual diagnostics and the ADF (Augmented Dickey-Fuller) test are important tools in time series modeling that help ensure the models are valid, interpretable, and produce reliable forecasts.

Residual diagnostics involve analyzing the residuals means the differences between the actual values and the predicted values from your model. These diagnostics test whether your model assumptions hold, particularly in regression or forecasting models. For example, the Breusch-Pagan test checks for heteroscedasticity, which is when the variance of residuals is not constant over time. Constant variance is a key assumption in linear regression; if violated, it can lead to inefficient or biased estimates. Similarly, the Ljung-Box test assesses whether residuals are autocorrelated, which means they are correlated across time. If residuals show autocorrelation, your model has likely failed to capture some time-based structure in the data, indicating the model is underfitting or misspecified. Performing these diagnostics ensures that your model is statistically sound and that the insights or forecasts it provides are trustworthy.

In [ ]:
# Residual Diagnostics - Test and Summary Table

import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.diagnostic import acorr_ljungbox
import pandas as pd
import numpy as np
from tabulate import tabulate
from google.colab import files

# === Function: Residual Diagnostics for one target ===
def residual_diagnostics(X, y):
    data = pd.concat([X, y], axis=1).dropna()
    X_cleaned = data[X.columns]
    y_cleaned = data[y.name]

    X_const = sm.add_constant(X_cleaned)
    model = sm.OLS(y_cleaned, X_const).fit()
    residuals = model.resid

    # Breusch-Pagan Test
    bp_test = het_breuschpagan(residuals, X_const.loc[residuals.index])
    bp_labels = ['LM stat', 'BP p-value', 'BP f-value', 'BP f p-value']
    bp_results = dict(zip(bp_labels, bp_test))

    # Ljung-Box Test
    if len(residuals) > 10:
        lb_test = acorr_ljungbox(residuals, lags=[10], return_df=True)
        lb_pvalue = lb_test['lb_pvalue'].iloc[0]
    else:
        lb_pvalue = "Insufficient data (n < 10)"

    return bp_results, lb_pvalue, residuals

# === Setup
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']
diagnostics_summary = []

if 'df_lagged' in locals():
    for target_col in target_cols:
        print(f"\n Running diagnostics for: {target_col}")

        if target_col not in df_lagged.columns:
            print(f" Skipping {target_col} — not found in df_lagged")
            continue

        y = df_lagged[target_col]
        X = df_lagged.drop(columns=target_cols, errors='ignore')

        data = pd.concat([X, y], axis=1).dropna()
        if data.empty:
            print(" Not enough data after dropping NaNs")
            continue

        bp_results, lb_pvalue, residuals = residual_diagnostics(X, y)
        diagnostics_summary.append({
            "Target": target_col,
            "Breusch-Pagan LM stat": round(bp_results['LM stat'], 4),
            "BP p-value": round(bp_results['BP p-value'], 4),
            "BP f-value": round(bp_results['BP f-value'], 4),
            "BP f p-value": round(bp_results['BP f p-value'], 4),
            "Ljung-Box p-value (lag=10)": lb_pvalue,
            "Residual Mean": round(residuals.mean(), 4),
            "Residual Variance": round(residuals.var(), 4)
        })

    # === Create summary table
    diagnostics_df = pd.DataFrame(diagnostics_summary)

    # Print as fancy table
    print("\n📋 Residual Diagnostics Summary:")
    print(tabulate(diagnostics_df, headers='keys', tablefmt='fancy_grid', showindex=False))

    # === Export to CSV
    filename = "residual_diagnostics_summary.csv"
    diagnostics_df.to_csv(filename, index=False)

    # Download the file (include full path)
    from google.colab import files
    files.download(filename)

else:
    print("❗ df_lagged is not defined. Please run your preprocessing cell first.")
 Running diagnostics for: Life expectancy

 Running diagnostics for: Cardiovascular diseases

 Running diagnostics for: Diabetes

📋 Residual Diagnostics Summary:
╒═════════════════════════╤═════════════════════════╤══════════════╤══════════════╤════════════════╤══════════════════════════════╤═════════════════╤═════════════════════╕
│ Target                  │   Breusch-Pagan LM stat │   BP p-value │   BP f-value │   BP f p-value │   Ljung-Box p-value (lag=10) │   Residual Mean │   Residual Variance │
╞═════════════════════════╪═════════════════════════╪══════════════╪══════════════╪════════════════╪══════════════════════════════╪═════════════════╪═════════════════════╡
│ Life expectancy         │                1054.96  │            0 │      23.3712 │              0 │                            0 │               0 │             11.3211 │
├─────────────────────────┼─────────────────────────┼──────────────┼──────────────┼────────────────┼──────────────────────────────┼─────────────────┼─────────────────────┤
│ Cardiovascular diseases │                 423.071 │            0 │       9.0137 │              0 │                            0 │               0 │          19607.4    │
├─────────────────────────┼─────────────────────────┼──────────────┼──────────────┼────────────────┼──────────────────────────────┼─────────────────┼─────────────────────┤
│ Diabetes                │                1517.67  │            0 │      34.6315 │              0 │                            0 │               0 │              9.8512 │
╘═════════════════════════╧═════════════════════════╧══════════════╧══════════════╧════════════════╧══════════════════════════════╧═════════════════╧═════════════════════╛

For life expectancy, the model displays significant signs of both heteroscedasticity and autocorrelation. The Breusch-Pagan test results show extremely low p-values, indicating that the variance of residuals is not constant and may vary depending on specific predictor values. This suggests that the linear model may be missing key nonlinear components or interaction terms that could stabilize prediction behavior. Additionally, the Ljung-Box test reveals strong autocorrelation at lag 10, meaning past errors are influencing current ones — a sign that temporal patterns are not fully addressed. While the mean residual is centered at zero, which reflects no bias, the residual variance of 11.32 indicates moderate inconsistency in prediction accuracy across observations.

For diabetes, the residual profile reveals similar issues. The Breusch-Pagan test indicates pronounced heteroscedasticity, reinforcing the idea that predictor influence changes across the prediction space, particularly among metabolic or demographic variables like BMI and age. Autocorrelation is again significant according to the Ljung-Box test, implying model limitations in capturing lagged or sequential health dynamics. Although the mean residual is virtually zero — a good sign for bias — the variance of 9.85 suggests moderate prediction error dispersion, warranting further refinement in feature interaction or time-aware modeling.

The cardiovascular diseases model also shows clear heteroscedasticity, as highlighted by the Breusch-Pagan results with very low p-values. Autocorrelation is present, which points to time-based dependencies not fully captured in the linear framework. Most strikingly, the residual variance is extremely high at 19,607.45, hinting at either model instability, data skewness, or presence of outliers that are drastically affecting performance. Despite having a neutral residual mean, the model appears highly sensitive to certain predictors and may benefit from robust regression techniques or transformations to control volatility.

Overall, all three models demonstrate residual patterns that suggest issues with non-constant variance and temporal correlation. These findings recommend considering more flexible approaches such as time-series models, generalized least squares, or regression techniques that accommodate heteroscedasticity and autocorrelation directly. Enhancing each model to better capture nonlinearities or lag structures could meaningfully improve predictive reliability and interpretability.

Heteroscedasticity and Autocorrelation Consistent (HAC)¶

According the result of Residual Diagnostics indicate that the model's residuals exhibit both heteroscedasticity and autocorrelation, which violate the assumption of constant variance and independence of residuals ordinary least squares (OLS) regression.

To solve this problem by using robust standard errors (Heteroskedasticity-Autocorrelation Consistent or HAC standard errors) that account for both heteroscedasticity and autocorrelation in the variance-covariance matrix.

HAC corrected standard errors (like from Newey-West estimator) adjust the model's coefficient uncertainty when residuals are non-constant and correlated across time. It doesn't change the point estimates, but it makes the statistical tests more reliable — especially t-values, p-values, and confidence intervals.

In [ ]:
# HAC REVISED
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Target columns
target_cols = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

# Iterate through each target
for target in target_cols:
    print(f"\n=== Newey-West Adjusted OLS Results for: {target} ===")

    try:
        # Define target and predictors
        y = df_lagged[target]
        all_target_cols_in_df = [col for col in target_cols if col in df_lagged.columns]
        X = df_lagged.drop(columns=all_target_cols_in_df, errors='ignore')

        # Combine and clean
        data = pd.concat([X, y], axis=1).dropna()
        X_cleaned = data[X.columns]
        y_cleaned = data[y.name]

        # Add constant term
        X_const = sm.add_constant(X_cleaned)

        # Fit OLS
        model = sm.OLS(y_cleaned, X_const).fit()

        # Newey-West HAC adjustment
        nobs = len(y_cleaned)
        maxlags = min(5, nobs - 1)
        nw_model = model.get_robustcov_results(cov_type='HAC', maxlags=maxlags)

        # Print summary
        print(nw_model.summary())

        # Residual analysis
        residuals = nw_model.resid
        print(f"\n📊 Residuals Summary for '{target}':")
        print(pd.Series(residuals).describe())

        if len(residuals) > 0 and not np.all(residuals == 0):
            plt.figure(figsize=(10, 4))
            plt.plot(range(len(residuals)), residuals, color='darkblue', linewidth=1)
            plt.axhline(0, color='gray', linestyle='--')
            plt.title(f"Residuals Over Time — {target}", fontsize=14)
            plt.xlabel("Observation Index")
            plt.ylabel("Residual")
            plt.grid(True)
            plt.tight_layout()
            plt.show()
        else:
            print(f"⚠️ Residuals for '{target}' are empty or flat — no variation to plot.")

    except Exception as e:
        print(f"❌ Could not fit HAC model or plot residuals for {target}: {e}")

# export and download file
best_performance_df.to_csv("best_feature_selection_summary.csv", index=False)

from google.colab import files
files.download("best_feature_selection_summary.csv")
=== Newey-West Adjusted OLS Results for: Life expectancy ===
                            OLS Regression Results                            
==============================================================================
Dep. Variable:        Life expectancy   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.917
Method:                 Least Squares   F-statistic:                     1.278
Date:                Thu, 24 Jul 2025   Prob (F-statistic):              0.276
Time:                        19:55:10   Log-Likelihood:                -44559.
No. Observations:               16928   AIC:                         8.922e+04
Df Residuals:                   16879   BIC:                         8.959e+04
Df Model:                          48                                         
Covariance Type:                  HAC                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                              -1.387e+08        nan        nan        nan         nan         nan
Cost of a healthy diet                -5.6642      2.898     -1.955      0.051     -11.344       0.016
Income                                -1.2168      0.285     -4.270      0.000      -1.775      -0.658
Inflation                              0.1587      0.067      2.372      0.018       0.028       0.290
Child mortality rate                 -26.6933      1.192    -22.396      0.000     -29.029     -24.357
Unemployment Rate                     -0.2086      0.297     -0.702      0.483      -0.791       0.374
Incomplete tertiary education          0.2633      0.131      2.003      0.045       0.006       0.521
Gini coefficient                      12.2883     15.458      0.795      0.427     -18.011      42.588
Sex ratio                           2.196e+08   4.07e+09      0.054      0.957   -7.77e+09     8.2e+09
GDP                                    0.1197      0.081      1.469      0.142      -0.040       0.279
Median age                           502.6744    576.528      0.872      0.383    -627.380    1632.729
CPI                                   -0.0719      0.024     -3.044      0.002      -0.118      -0.026
BMI_avg                               -0.0414      0.039     -1.073      0.283      -0.117       0.034
Cost of a healthy diet_lag1            0.5269      1.822      0.289      0.772      -3.044       4.098
Cost of a healthy diet_lag2           -0.3998      2.777     -0.144      0.886      -5.843       5.043
Cost of a healthy diet_lag3            4.8015      3.009      1.596      0.111      -1.096      10.699
Income_lag1                            0.1039      0.201      0.518      0.605      -0.289       0.497
Income_lag2                            0.0726      0.244      0.297      0.767      -0.407       0.552
Income_lag3                            1.1014      0.325      3.393      0.001       0.465       1.738
Inflation_lag1                         0.0803      0.042      1.923      0.054      -0.002       0.162
Inflation_lag2                         0.0255      0.042      0.612      0.541      -0.056       0.107
Inflation_lag3                        -0.1654      0.077     -2.155      0.031      -0.316      -0.015
Child mortality rate_lag1              0.0505      1.106      0.046      0.964      -2.116       2.217
Child mortality rate_lag2              3.3981      1.241      2.739      0.006       0.966       5.830
Child mortality rate_lag3             10.0025        nan        nan        nan         nan         nan
Unemployment Rate_lag1                -0.3134      0.225     -1.396      0.163      -0.754       0.127
Unemployment Rate_lag2                 0.5150      0.240      2.142      0.032       0.044       0.986
Unemployment Rate_lag3                -0.1186      0.281     -0.422      0.673      -0.669       0.432
Incomplete tertiary education_lag1    -0.0128      0.125     -0.103      0.918      -0.257       0.231
Incomplete tertiary education_lag2     0.0864      0.159      0.544      0.587      -0.225       0.398
Incomplete tertiary education_lag3    -0.3514      0.200     -1.758      0.079      -0.743       0.040
Gini coefficient_lag1                  4.2493      8.833      0.481      0.630     -13.064      21.563
Gini coefficient_lag2                 -2.9126      8.570     -0.340      0.734     -19.711      13.886
Gini coefficient_lag3                -18.9249      9.612     -1.969      0.049     -37.765      -0.085
Sex ratio_lag1                      4.569e+07   8.87e+09      0.005      0.996   -1.73e+10    1.74e+10
Sex ratio_lag2                      1.123e+08    7.4e+09      0.015      0.988   -1.44e+10    1.46e+10
Sex ratio_lag3                      1.381e+08   1.97e+09      0.070      0.944   -3.73e+09    4.01e+09
GDP_lag1                               0.0178      0.077      0.231      0.817      -0.133       0.169
GDP_lag2                              -0.0076      0.025     -0.310      0.757      -0.056       0.041
GDP_lag3                              -0.0259      0.051     -0.508      0.611      -0.126       0.074
Median age_lag1                     -618.4803   1131.103     -0.547      0.585   -2835.561    1598.600
Median age_lag2                     -418.3976   1146.221     -0.365      0.715   -2665.111    1828.316
Median age_lag3                      503.8654    599.070      0.841      0.400    -670.375    1678.105
CPI_lag1                              -0.0128      0.017     -0.771      0.441      -0.045       0.020
CPI_lag2                               0.0047      0.015      0.317      0.751      -0.024       0.034
CPI_lag3                               0.0772      0.016      4.783      0.000       0.046       0.109
BMI_avg_lag1                          -0.0056      0.026     -0.218      0.828      -0.056       0.045
BMI_avg_lag2                          -0.0167      0.028     -0.585      0.558      -0.072       0.039
BMI_avg_lag3                           0.0151      0.041      0.365      0.715      -0.066       0.096
==============================================================================
Omnibus:                     2958.623   Durbin-Watson:                   0.136
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            10571.972
Skew:                          -0.860   Prob(JB):                         0.00
Kurtosis:                       6.468   Cond. No.                     6.74e+11
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 5 lags and without small sample correction
[2] The smallest eigenvalue is 2.9e-16. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

📊 Residuals Summary for 'Life expectancy':
count    16928.000000
mean         0.000003
std          3.364682
min        -32.524090
25%         -1.963268
50%          0.267816
75%          2.256538
max         15.502762
dtype: float64
/usr/local/lib/python3.11/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 48, but rank is 4
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1884: RuntimeWarning: invalid value encountered in sqrt
  return np.sqrt(np.diag(self.cov_params()))
No description has been provided for this image
=== Newey-West Adjusted OLS Results for: Cardiovascular diseases ===
                               OLS Regression Results                              
===================================================================================
Dep. Variable:     Cardiovascular diseases   R-squared:                       0.043
Model:                                 OLS   Adj. R-squared:                  0.040
Method:                      Least Squares   F-statistic:                   0.01330
Date:                     Thu, 24 Jul 2025   Prob (F-statistic):               1.00
Time:                             19:55:10   Log-Likelihood:            -1.0767e+05
No. Observations:                    16928   AIC:                         2.154e+05
Df Residuals:                        16879   BIC:                         2.158e+05
Df Model:                               48                                         
Covariance Type:                       HAC                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                              -4.323e+09   2.21e+09     -1.955      0.051   -8.66e+09    1.02e+07
Cost of a healthy diet                12.5572     32.502      0.386      0.699     -51.151      76.265
Income                                21.3556     17.251      1.238      0.216     -12.457      55.169
Inflation                              1.1911      2.883      0.413      0.679      -4.460       6.842
Child mortality rate                  -1.6502     55.220     -0.030      0.976    -109.888     106.588
Unemployment Rate                     16.5400      9.638      1.716      0.086      -2.352      35.432
Incomplete tertiary education         -5.2535      7.608     -0.691      0.490     -20.166       9.659
Gini coefficient                     160.8072    595.017      0.270      0.787   -1005.488    1327.102
Sex ratio                           7.411e+09   1.32e+11      0.056      0.955   -2.51e+11    2.66e+11
GDP                                    3.3278      2.869      1.160      0.246      -2.295       8.950
Median age                          1097.7356   1.82e+04      0.060      0.952   -3.45e+04    3.67e+04
CPI                                    0.4940      1.274      0.388      0.698      -2.002       2.990
BMI_avg                               -1.7352      1.782     -0.974      0.330      -5.228       1.757
Cost of a healthy diet_lag1            9.9738        nan        nan        nan         nan         nan
Cost of a healthy diet_lag2           11.7875     68.555      0.172      0.863    -122.587     146.162
Cost of a healthy diet_lag3            1.1933     37.275      0.032      0.974     -71.869      74.256
Income_lag1                            0.4962      8.482      0.058      0.953     -16.130      17.123
Income_lag2                           13.8650     19.042      0.728      0.467     -23.459      51.189
Income_lag3                          -12.2323     13.529     -0.904      0.366     -38.751      14.287
Inflation_lag1                        -0.4568      1.486     -0.307      0.759      -3.370       2.457
Inflation_lag2                        -0.6003      1.574     -0.381      0.703      -3.686       2.485
Inflation_lag3                        -1.7078      3.475     -0.491      0.623      -8.519       5.103
Child mortality rate_lag1              1.6709        nan        nan        nan         nan         nan
Child mortality rate_lag2              2.8105        nan        nan        nan         nan         nan
Child mortality rate_lag3              8.4775     17.638      0.481      0.631     -26.096      43.051
Unemployment Rate_lag1                -0.7751      2.350     -0.330      0.742      -5.382       3.832
Unemployment Rate_lag2                 0.5853      0.323      1.811      0.070      -0.048       1.219
Unemployment Rate_lag3                -5.4142      5.114     -1.059      0.290     -15.438       4.609
Incomplete tertiary education_lag1    -0.0505      4.402     -0.011      0.991      -8.679       8.578
Incomplete tertiary education_lag2     0.4994      1.740      0.287      0.774      -2.911       3.909
Incomplete tertiary education_lag3    -2.5220      7.285     -0.346      0.729     -16.801      11.757
Gini coefficient_lag1                 50.2384    134.988      0.372      0.710    -214.351     314.828
Gini coefficient_lag2                197.9054    214.176      0.924      0.355    -221.902     617.712
Gini coefficient_lag3               -761.0729    290.835     -2.617      0.009   -1331.140    -191.006
Sex ratio_lag1                      1.369e+09        nan        nan        nan         nan         nan
Sex ratio_lag2                      1.918e+09        nan        nan        nan         nan         nan
Sex ratio_lag3                      5.383e+09        nan        nan        nan         nan         nan
GDP_lag1                               0.2412        nan        nan        nan         nan         nan
GDP_lag2                              -0.0958        nan        nan        nan         nan         nan
GDP_lag3                               1.1461      2.557      0.448      0.654      -3.867       6.159
Median age_lag1                    -1056.8683   3.13e+04     -0.034      0.973   -6.24e+04    6.03e+04
Median age_lag2                     5836.9081        nan        nan        nan         nan         nan
Median age_lag3                    -3728.3101        nan        nan        nan         nan         nan
CPI_lag1                               0.0970      0.685      0.142      0.887      -1.247       1.441
CPI_lag2                               0.1355      0.706      0.192      0.848      -1.248       1.519
CPI_lag3                               1.4261      1.293      1.103      0.270      -1.108       3.961
BMI_avg_lag1                          -0.1263      0.903     -0.140      0.889      -1.896       1.643
BMI_avg_lag2                          -0.0982      0.921     -0.107      0.915      -1.904       1.708
BMI_avg_lag3                          -3.5173      1.732     -2.031      0.042      -6.912      -0.123
==============================================================================
Omnibus:                    26274.800   Durbin-Watson:                   0.029
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          9869872.713
Skew:                          10.133   Prob(JB):                         0.00
Kurtosis:                     119.544   Cond. No.                     6.74e+11
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 5 lags and without small sample correction
[2] The smallest eigenvalue is 2.9e-16. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

📊 Residuals Summary for 'Cardiovascular diseases':
count    16928.000000
mean         0.000031
std        140.026594
min       -114.217116
25%        -36.951825
50%        -17.545906
75%          2.043523
max       1848.846904
dtype: float64
/usr/local/lib/python3.11/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 48, but rank is 4
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1884: RuntimeWarning: invalid value encountered in sqrt
  return np.sqrt(np.diag(self.cov_params()))
No description has been provided for this image
=== Newey-West Adjusted OLS Results for: Diabetes ===
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               Diabetes   R-squared:                       0.539
Model:                            OLS   Adj. R-squared:                  0.537
Method:                 Least Squares   F-statistic:                     2.167
Date:                Thu, 24 Jul 2025   Prob (F-statistic):             0.0700
Time:                        19:55:11   Log-Likelihood:                -43381.
No. Observations:               16928   AIC:                         8.686e+04
Df Residuals:                   16879   BIC:                         8.724e+04
Df Model:                          48                                         
Covariance Type:                  HAC                                         
======================================================================================================
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
const                              -1.294e+08        nan        nan        nan         nan         nan
Cost of a healthy diet                 6.7514      1.664      4.058      0.000       3.490      10.013
Income                                -1.0163      0.277     -3.675      0.000      -1.558      -0.474
Inflation                             -0.1576      0.059     -2.650      0.008      -0.274      -0.041
Child mortality rate                  -0.1721      0.736     -0.234      0.815      -1.616       1.271
Unemployment Rate                      0.0279      0.183      0.152      0.879      -0.331       0.387
Incomplete tertiary education         -0.2060      0.048     -4.335      0.000      -0.299      -0.113
Gini coefficient                     -11.5868        nan        nan        nan         nan         nan
Sex ratio                           1.678e+08   3.41e+09      0.049      0.961   -6.52e+09    6.85e+09
GDP                                   -0.1475      0.030     -4.868      0.000      -0.207      -0.088
Median age                           503.8324    343.204      1.468      0.142    -168.883    1176.548
CPI                                    0.0779      0.022      3.592      0.000       0.035       0.120
BMI_avg                               -0.0456      0.057     -0.799      0.424      -0.157       0.066
Cost of a healthy diet_lag1            0.0382      0.929      0.041      0.967      -1.783       1.860
Cost of a healthy diet_lag2            0.8687      1.825      0.476      0.634      -2.708       4.446
Cost of a healthy diet_lag3            0.7153      2.080      0.344      0.731      -3.362       4.792
Income_lag1                           -0.2537      0.185     -1.369      0.171      -0.617       0.110
Income_lag2                           -0.0917      0.262     -0.350      0.726      -0.606       0.422
Income_lag3                           -0.5161      0.281     -1.835      0.067      -1.067       0.035
Inflation_lag1                        -0.0909      0.038     -2.403      0.016      -0.165      -0.017
Inflation_lag2                        -0.0971      0.044     -2.215      0.027      -0.183      -0.011
Inflation_lag3                        -0.1121      0.049     -2.288      0.022      -0.208      -0.016
Child mortality rate_lag1             -0.3867      0.523     -0.739      0.460      -1.412       0.639
Child mortality rate_lag2              0.0770      0.278      0.277      0.782      -0.469       0.623
Child mortality rate_lag3              0.6715      0.251      2.675      0.007       0.179       1.164
Unemployment Rate_lag1                -0.0285      0.107     -0.268      0.789      -0.237       0.180
Unemployment Rate_lag2                -0.0775      0.092     -0.845      0.398      -0.257       0.102
Unemployment Rate_lag3                -0.1296      0.188     -0.689      0.491      -0.498       0.239
Incomplete tertiary education_lag1     0.0421      0.092      0.457      0.648      -0.138       0.223
Incomplete tertiary education_lag2    -0.0203      0.120     -0.169      0.866      -0.256       0.216
Incomplete tertiary education_lag3    -0.1239      0.153     -0.811      0.417      -0.423       0.176
Gini coefficient_lag1                 -7.6403      7.914     -0.965      0.334     -23.153       7.873
Gini coefficient_lag2                 -1.8097      5.717     -0.317      0.752     -13.015       9.395
Gini coefficient_lag3                  1.5183      8.841      0.172      0.864     -15.810      18.847
Sex ratio_lag1                      1.984e+07   5.37e+09      0.004      0.997   -1.05e+10    1.05e+10
Sex ratio_lag2                      4.239e+07        nan        nan        nan         nan         nan
Sex ratio_lag3                      2.511e+08        nan        nan        nan         nan         nan
GDP_lag1                              -0.0006      0.052     -0.012      0.990      -0.103       0.102
GDP_lag2                              -0.0003        nan        nan        nan         nan         nan
GDP_lag3                              -0.0483      0.041     -1.165      0.244      -0.130       0.033
Median age_lag1                     -718.9570    695.114     -1.034      0.301   -2081.454     643.539
Median age_lag2                     1089.5169    611.331      1.782      0.075    -108.757    2287.790
Median age_lag3                     -996.5964    382.339     -2.607      0.009   -1746.020    -247.173
CPI_lag1                              -0.0030      0.013     -0.234      0.815      -0.028       0.022
CPI_lag2                              -0.0011      0.014     -0.080      0.937      -0.028       0.026
CPI_lag3                               0.0204      0.023      0.882      0.378      -0.025       0.066
BMI_avg_lag1                          -0.0006      0.031     -0.021      0.984      -0.060       0.059
BMI_avg_lag2                          -0.0014      0.034     -0.042      0.967      -0.068       0.065
BMI_avg_lag3                           0.9702      0.056     17.417      0.000       0.861       1.079
==============================================================================
Omnibus:                     4120.296   Durbin-Watson:                   0.108
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            12682.365
Skew:                           1.250   Prob(JB):                         0.00
Kurtosis:                       6.425   Cond. No.                     6.74e+11
==============================================================================

Notes:
[1] Standard Errors are heteroscedasticity and autocorrelation robust (HAC) using 5 lags and without small sample correction
[2] The smallest eigenvalue is 2.9e-16. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

📊 Residuals Summary for 'Diabetes':
count    16928.000000
mean         0.000001
std          3.138661
min         -9.906907
25%         -2.044869
50%         -0.612431
75%          1.408433
max         24.396272
dtype: float64
/usr/local/lib/python3.11/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 48, but rank is 4
  warnings.warn('covariance of constraints does not have full '
/usr/local/lib/python3.11/dist-packages/statsmodels/regression/linear_model.py:1884: RuntimeWarning: invalid value encountered in sqrt
  return np.sqrt(np.diag(self.cov_params()))
No description has been provided for this image

Result of HAC:

The OLS regression summary reveals varying levels of model performance across the three health outcomes. The model for Life Expectancy performs impressively well, achieving an R-squared value of 0.918 and an adjusted R-squared of 0.917. These figures suggest that approximately 92% of the variance in life expectancy across observations is explained by the model’s predictors. Such high explanatory power typically reflects that the selected variables—likely socioeconomic, demographic, and health indicators—are deeply aligned with the drivers of longevity. However, despite the strong fit, the F-statistic is relatively low (1.278) and its p-value (0.276) indicates that the model as a whole is not statistically significant at conventional levels. This contradiction may point to multicollinearity among predictors or heteroscedasticity that affects the reliability of the overall model test, even while individual coefficients remain meaningful.

For Cardiovascular Diseases, the regression model displays a much weaker performance. The R-squared is only 0.043, suggesting that the predictors explain just 4.3% of the variation in cardiovascular disease prevalence. The adjusted R-squared is nearly identical at 0.040, further confirming the low explanatory power. The F-statistic is close to zero (0.0133) and the p-value is 1.00, which definitively indicates that the model lacks statistical significance overall. These results imply that either the selected predictors are poorly suited for modeling cardiovascular outcomes or that crucial variables are missing—such as direct measures of behavior, genetic predisposition, or healthcare access.

The Diabetes model performs moderately well. An R-squared of 0.539 and adjusted R-squared of 0.537 suggest that around 54% of the variance in diabetes rates is explained by the model’s features. This is notably better than the cardiovascular model, though not nearly as strong as the life expectancy case. The F-statistic of 2.167 implies some model-wide explanatory power, and the p-value (0.070) teeters just above conventional thresholds for significance. These results indicate that while the selected predictors are relevant to diabetes prevalence—likely including variables such as BMI, age, and income—the overall structure of the model may benefit from refinement or inclusion of additional interaction terms to reach stronger statistical credibility.

To solve the highly autocorrelation and heteroscendasticity problem, we need to Refine Feature Selection Based on HAC-Corrected Results to identify and retain only statistically significant predictors (based on HAC p-values) for regression models like Random Forest (RF)

In [ ]:
# Run HAC on each target

import statsmodels.api as sm
import pandas as pd
import numpy as np
from google.colab import files

# === HAC inference function for one target
def run_hac_inference(X, y, target_name, max_lag=10):
    # Ensure X is a clean DataFrame
    X_clean = pd.DataFrame(X).copy()

    # Ensure y is a 1D Series
    if isinstance(y, pd.DataFrame):
        y_clean = y.iloc[:, 0]
    else:
        y_clean = pd.Series(y).squeeze()

    # Add constant for intercept
    X_const = sm.add_constant(X_clean)

    # Fit OLS with HAC standard errors
    model = sm.OLS(y_clean, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': max_lag})

    # Format result into DataFrame
    summary_df = pd.DataFrame({
        'Target': target_name,
        'Feature': model.params.index,
        'Coefficient': model.params.values,
        'p-value (HAC)': model.pvalues.values
    })

    # Remove intercept row
    return summary_df[summary_df['Feature'] != 'const']

# === Setup
target_cols = ['Life expectancy', 'Diabetes', 'Cardiovascular diseases']
hac_results = []
stable_feature_dict = {}

# === Run HAC on each target
if 'df_lagged' in locals():
    for target in target_cols:
        print(f"\n📊 Processing HAC inference for: {target}")

        if target not in df_lagged.columns:
            print(f"⚠️ Skipping {target} — column not found")
            continue

        # Exclude all target columns — including current one — from predictors
        X = df_lagged.drop(columns=target_cols, errors='ignore')
        y = df_lagged[target]

        # Drop rows with NaNs
        data = pd.concat([X, y], axis=1).dropna()
        if data.empty:
            print(f"⛔ No clean data for {target}")
            continue

        X_clean = data[X.columns]
        y_clean = data[[target]]  # Keep as DataFrame for safe slicing

        # Run HAC inference
        try:
            summary_df = run_hac_inference(X_clean, y_clean, target)
        except Exception as e:
            print(f"❌ HAC failed for {target}: {e}")
            continue

        hac_results.append(summary_df)

        # Extract significant features
        stable_features = summary_df[summary_df['p-value (HAC)'] < 0.05]['Feature'].tolist()
        stable_feature_dict[target] = stable_features if stable_features else ['None']

    # === Combine all results
    if hac_results:
        hac_summary_df = pd.concat(hac_results).reset_index(drop=True)
        display(hac_summary_df)

        # Export full HAC summary
        hac_summary_df.to_csv("hac_inference_summary.csv", index=False)
        files.download("hac_inference_summary.csv")

        # Export stable features per target
        stable_df = pd.DataFrame([
            {'Target': tgt, 'Stable Features (p < 0.05)': ', '.join(feats)}
            for tgt, feats in stable_feature_dict.items()
        ])
        display(stable_df)
        stable_df.to_csv("hac_stable_features_per_target.csv", index=False)
        files.download("hac_stable_features_per_target.csv")
    else:
        print("⚠️ No HAC results to export.")
else:
    print("❗ 'df_lagged' not found. Please ensure your dataset is loaded.")

# export and download file
hac_summary_df.to_csv("hac_summary_df.csv", index=False)

from google.colab import files
files.download("hac_summary_df.csv")
📊 Processing HAC inference for: Life expectancy

📊 Processing HAC inference for: Diabetes

📊 Processing HAC inference for: Cardiovascular diseases
Target Feature Coefficient p-value (HAC)
0 Life expectancy Cost of a healthy diet -5.664228 1.070727e-02
1 Life expectancy Income -1.216788 3.570121e-05
2 Life expectancy Inflation 0.158719 5.512448e-03
3 Life expectancy Child mortality rate -26.693303 3.481127e-91
4 Life expectancy Unemployment Rate -0.208619 3.561879e-01
... ... ... ... ...
139 Cardiovascular diseases CPI_lag2 0.135534 8.529276e-01
140 Cardiovascular diseases CPI_lag3 1.426133 3.418988e-01
141 Cardiovascular diseases BMI_avg_lag1 -0.126308 8.424859e-01
142 Cardiovascular diseases BMI_avg_lag2 -0.098199 9.188838e-01
143 Cardiovascular diseases BMI_avg_lag3 -3.517348 4.536951e-02

144 rows × 4 columns

Target Stable Features (p < 0.05)
0 Life expectancy Cost of a healthy diet, Income, Inflation, Chi...
1 Diabetes Cost of a healthy diet, Income, Inflation, GDP...
2 Cardiovascular diseases BMI_avg_lag3
In [ ]:
# Run HAC on each Target - REVISED - Scaled

import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from google.colab import files

# === HAC inference function for one target
def run_hac_inference(X, y, target_name, max_lag=10):
    # Ensure X is a clean DataFrame
    X_clean = pd.DataFrame(X).copy()

    # Ensure y is a 1D Series
    if isinstance(y, pd.DataFrame):
        y_clean = y.iloc[:, 0]
    else:
        y_clean = pd.Series(y).squeeze()

    # Add constant for intercept
    X_const = sm.add_constant(X_clean)

    # Fit OLS with HAC standard errors
    model = sm.OLS(y_clean, X_const).fit(cov_type='HAC', cov_kwds={'maxlags': max_lag})

    # Format result into DataFrame
    summary_df = pd.DataFrame({
        'Target': target_name,
        'Feature': model.params.index,
        'Coefficient': model.params.values,
        'p-value (HAC)': model.pvalues.values
    })

    # Remove intercept row
    return summary_df[summary_df['Feature'] != 'const']


# === Setup
target_cols = ['Life expectancy', 'Diabetes', 'Cardiovascular diseases']
hac_results = []
stable_feature_dict = {}

# === Run HAC on each target
if 'df_lagged' in locals():
    for target in target_cols:
        print(f"\n📊 Processing HAC inference for: {target}")

        if target not in df_lagged.columns:
            print(f"⚠️ Skipping {target} — column not found")
            continue

        # Exclude all target columns — including current one — from predictors
        X = df_lagged.drop(columns=target_cols, errors='ignore')
        y = df_lagged[target]

        # Drop rows with NaNs
        data = pd.concat([X, y], axis=1).dropna()
        if data.empty:
            print(f"⛔ No clean data for {target}")
            continue

        X_clean = data[X.columns]
        y_clean = data[[target]]  # Keep as DataFrame for safe slicing

        # === Feature Scaling: Standardize X before regression
        scaler = StandardScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X_clean), columns=X_clean.columns, index=X_clean.index)

        # Run HAC inference
        try:
            summary_df = run_hac_inference(X_scaled, y_clean, target)
        except Exception as e:
            print(f"❌ HAC failed for {target}: {e}")
            continue

        hac_results.append(summary_df)

        # Extract stable features (p < 0.05)
        stable_features = summary_df[summary_df['p-value (HAC)'] < 0.05]['Feature'].tolist()
        stable_feature_dict[target] = stable_features if stable_features else ['None']

    # === Combine all results
    if hac_results:
        hac_summary_df = pd.concat(hac_results).reset_index(drop=True)
        display(hac_summary_df)

        # Export full HAC summary
        hac_summary_df.to_csv("hac_inference_summary_scaled.csv", index=False)
        files.download("hac_inference_summary_scaled.csv")

        # Export stable features per target
        stable_df = pd.DataFrame([
            {'Target': tgt, 'Stable Features (p < 0.05)': ', '.join(feats)}
            for tgt, feats in stable_feature_dict.items()
        ])
        display(stable_df)
        stable_df.to_csv("hac_stable_features_scaled.csv", index=False)
        files.download("hac_stable_features_scaled.csv")
    else:
        print("⚠️ No HAC results to export.")
else:
    print("❗ 'df_lagged' not found. Please ensure your dataset is loaded.")

# Optional: Export summary again for backup
if 'hac_summary_df' in locals():
    hac_summary_df.to_csv("hac_summary_df_scaled.csv", index=False)
    files.download("hac_summary_df_scaled.csv")
📊 Processing HAC inference for: Life expectancy

📊 Processing HAC inference for: Diabetes

📊 Processing HAC inference for: Cardiovascular diseases
Target Feature Coefficient p-value (HAC)
0 Life expectancy Cost of a healthy diet -0.326902 3.181385e-02
1 Life expectancy Income -0.650294 1.440014e-05
2 Life expectancy Inflation 0.106669 1.227467e-02
3 Life expectancy Child mortality rate -22.724780 4.466047e-71
4 Life expectancy Unemployment Rate -0.113464 4.378930e-01
... ... ... ... ...
139 Cardiovascular diseases CPI_lag2 0.602719 7.943913e-01
140 Cardiovascular diseases CPI_lag3 6.349755 1.789535e-01
141 Cardiovascular diseases BMI_avg_lag1 -0.449466 8.551041e-01
142 Cardiovascular diseases BMI_avg_lag2 -0.349445 8.854279e-01
143 Cardiovascular diseases BMI_avg_lag3 -12.517319 2.922596e-02

144 rows × 4 columns

Target Stable Features (p < 0.05)
0 Life expectancy Cost of a healthy diet, Income, Inflation, Chi...
1 Diabetes Cost of a healthy diet, Income, Inflation, GDP...
2 Cardiovascular diseases Unemployment Rate, Sex ratio, Gini coefficient...

Your HAC regression results provide a statistically grounded view of how different predictors relate to the three health outcomes — Life Expectancy, Diabetes, and Cardiovascular Diseases — after adjusting for autocorrelation and heteroscedasticity. For Life Expectancy, several variables emerged as statistically significant, including Child mortality rate (extremely strong negative relationship with a near-zero p-value), Income, Inflation, GDP, CPI, and lagged features like Unemployment Rate_lag2, Cost of a healthy diet_lag3, Inflation_lag1, and Income_lag3. These results suggest that socioeconomic and health indicators are strongly associated with longevity and can be confidently included in the forecasting model. For Diabetes, stable predictors included Income, GDP, Inflation, CPI, and Cost of a healthy diet, with especially strong significance for BMI_avg_lag3 (p ≈ 3.14E-56), which reflects a deep connection between body mass trends and diabetes outcomes. However, Cardiovascular Diseases showed relatively weak statistical signal across most features, with high p-values and low explanatory strength — indicating poor model fit. Only a few variables, such as BMI_avg_lag3, reached borderline significance.

In summary, the HAC-adjusted regression confirms the reliability of several predictors for Life Expectancy and Diabetes, providing a stable foundation for retraining your forecast models. Cardiovascular Diseases, on the other hand, lacks robust explanatory variables and may need to be reported as exploratory, or supplemented with additional features if available. Your next step would be to extract the statistically significant features (e.g., p-value < 0.05) and use them to retrain the Random Forest or regression-based models for time-series forecasting. These filtered predictors will reduce noise, improve interpretability, and enhance predictive reliability across your rolling validation framework.

Extract Stable Predictors (p < 0.05)¶

In [ ]:
# Extract Stable Predictor - REVISED - Table

import pandas as pd
from tabulate import tabulate
from google.colab import files

# === Settings
significance_threshold = 0.05
grouped_results = []

# === Check if HAC summary exists
if 'hac_summary_df' not in globals():
    raise ValueError("⚠️ Please run the HAC regression first to generate 'hac_summary_df'.")

# === Extract stable features for each target
for target in hac_summary_df['Target'].unique():
    df_target = hac_summary_df[hac_summary_df['Target'] == target]
    stable_df = df_target[df_target['p-value (HAC)'] < significance_threshold]

    if stable_df.empty:
        grouped_results.append({
            "Target": target,
            "Stable Predictor": "None",
            "p-value": ""
        })
    else:
        for _, row in stable_df.sort_values(by='p-value (HAC)').iterrows():
            grouped_results.append({
                "Target": target,
                "Stable Predictor": row['Feature'],
                "p-value < 0.05": f"{row['p-value (HAC)']:.4g}"
            })

# === Create summary DataFrame
stable_summary_df = pd.DataFrame(grouped_results)

# === Display as nicely formatted table
print("\n📌 Stable Predictors (p < 0.05) by Target:\n")
print(tabulate(stable_summary_df, headers='keys', tablefmt='fancy_grid', showindex=False))

# === Export to CSV
csv_filename = "hac_stable_predictors_summary.csv"
stable_summary_df.to_csv(csv_filename, index=False)
files.download(csv_filename)
📌 Stable Predictors (p < 0.05) by Target:

╒═════════════════════════╤═══════════════════════════╤══════════════════╕
│ Target                  │ Stable Predictor          │   p-value < 0.05 │
╞═════════════════════════╪═══════════════════════════╪══════════════════╡
│ Life expectancy         │ Child mortality rate      │        4.466e-71 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Child mortality rate_lag3 │        8.066e-19 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Income                    │        1.44e-05  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Income_lag3               │        0.0001574 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Child mortality rate_lag2 │        0.0004716 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ CPI_lag3                  │        0.0009038 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ CPI                       │        0.002188  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Unemployment Rate_lag2    │        0.006369  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Sex ratio                 │        0.01085   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Inflation_lag3            │        0.01139   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Inflation                 │        0.01227   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ GDP                       │        0.02303   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Inflation_lag1            │        0.03167   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Cost of a healthy diet    │        0.03181   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Life expectancy         │ Sex ratio_lag2            │        0.04604   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ BMI_avg_lag3              │        1.009e-66 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ CPI                       │        0.0002375 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Income                    │        0.0004172 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ GDP                       │        0.0007287 │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Cost of a healthy diet    │        0.001212  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Median age_lag3           │        0.001281  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Inflation                 │        0.006302  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Inflation_lag1            │        0.006318  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Sex ratio_lag3            │        0.008815  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Inflation_lag2            │        0.009287  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Median age_lag2           │        0.01103   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Inflation_lag3            │        0.03887   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Diabetes                │ Income_lag3               │        0.04485   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Cardiovascular diseases │ Sex ratio                 │        0.005065  │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Cardiovascular diseases │ BMI_avg_lag3              │        0.02923   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Cardiovascular diseases │ Gini coefficient_lag3     │        0.03251   │
├─────────────────────────┼───────────────────────────┼──────────────────┤
│ Cardiovascular diseases │ Unemployment Rate         │        0.03382   │
╘═════════════════════════╧═══════════════════════════╧══════════════════╛
In [ ]:
# Stable Predictors Bar Charts

import matplotlib.pyplot as plt
import seaborn as sns

# Convert p-values to float for plotting
stable_summary_df['p-value'] = stable_summary_df['p-value < 0.05'].astype(float)

# Set plot style
sns.set(style="whitegrid")

# Get list of unique targets
targets = stable_summary_df['Target'].unique()

# Create one bar chart per target
for target in targets:
    plt.figure(figsize=(10, 6))
    target_df = stable_summary_df[stable_summary_df['Target'] == target].copy()

    # Sort by p-value (lowest = most significant)
    target_df = target_df.sort_values('p-value', ascending=True)

    # Barplot of -log10(p-value) for visibility
    sns.barplot(
        data=target_df,
        x=-np.log10(target_df['p-value']),  # higher bar = more significant
        y='Stable Predictor',
        palette='viridis'
    )

    plt.title(f"Stable Predictors for {target} (p < 0.05)", fontsize=14)
    plt.xlabel('-log10(p-value)')
    plt.ylabel('Predictor')
    plt.tight_layout()
    plt.show()
/tmp/ipython-input-21-1853306393.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image
/tmp/ipython-input-21-1853306393.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image
/tmp/ipython-input-21-1853306393.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(
No description has been provided for this image

Retrain Models per Targets and Forecast Evaluation¶

In [ ]:
# Retrain Models per Targets and Forecast Evaluation

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
from tabulate import tabulate

# === Recreate stable_features_dict from hac_summary_df ===
# Assuming hac_summary_df is available from the previous cell's execution
stable_features_dict = {}
if 'hac_summary_df' in locals() and not hac_summary_df.empty:
    significance_threshold = 0.05 # Define the significance threshold
    for target in hac_summary_df['Target'].unique():
        df_target = hac_summary_df[hac_summary_df['Target'] == target]
        stable_df = df_target[df_target['p-value (HAC)'] < significance_threshold]
        stable_features_dict[target] = stable_df['Feature'].tolist()
else:
    print("Error: hac_summary_df not found or is empty. Cannot proceed with retraining.")
    # You might want to exit or raise an error here if hac_summary_df is essential
    # For now, we'll let the loop below handle the case where stable_features_dict is empty

# Assuming df_lagged is available from previous steps

forecast_results = []

# Sort chronologically if 'Year' exists
df_sorted = df_lagged.sort_values(by='Year') if 'Year' in df_lagged.columns else df_lagged.copy()

# Check if stable_features_dict was successfully populated
if stable_features_dict:
    for target, features in stable_features_dict.items():
        if not features: # Check if the list of features is empty for this target
            print(f"⚠️ Skipping '{target}' — no stable predictors found.")
            continue

        print(f"\n✅ Training model for: {target}")

        # Prepare data
        # Ensure only existing features are selected from df_sorted
        existing_features = [f for f in features if f in df_sorted.columns]
        if not existing_features:
             print(f"⚠️ Skipping '{target}' — none of the selected stable features exist in the DataFrame.")
             continue

        df_subset = df_sorted[existing_features + [target]].dropna()

        if df_subset.empty:
            print(f"⛔ Skipping '{target}' — no data available after dropping NaNs for selected features.")
            continue


        X = df_subset[existing_features] # Use only existing features
        y = df_subset[target]

        # Time-based train-test split (80/20)
        split_idx = int(len(df_subset) * 0.8)
        X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
        y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

        # Ensure train and test sets are not empty
        if X_train.empty or X_test.empty:
             print(f"⛔ Skipping '{target}' — Train or Test set is empty after splitting.")
             continue


        # Train model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)  # Manual root
        r2 = r2_score(y_test, y_pred)

        print(f"📉 RMSE: {rmse:.3f}")
        print(f"📊 R² Score: {r2:.3f}")

        # Save results
        forecast_results.append({
            'Target': target,
            'RMSE': round(rmse, 3),
            'R²': round(r2, 3),
            'Stable Features Count': len(existing_features), # Use count of existing features
            'Stable Features': ', '.join(existing_features) # Use list of existing features
        })

    # 📋 Summary table
    if forecast_results:
        results_df = pd.DataFrame(forecast_results)
        print("\n🔍 Forecast Model Performance Summary (using stable features):")
        print(tabulate(results_df, headers="keys", tablefmt="fancy_grid", showindex=False))

        # Save file as CSV
        filename = "forecast_model_performance_summary_stable_features.csv"
        results_df.to_csv(filename, index=False)
        print(f"\n⬇️ Downloading {filename}")
        from google.colab import files
        files.download(filename)
    else:
        print("\nNo forecast results were generated.")

else:
    print("stable_features_dict was not populated. Please check previous steps.")
✅ Training model for: Life expectancy
📉 RMSE: 3.268
📊 R² Score: 0.917

✅ Training model for: Diabetes
📉 RMSE: 3.289
📊 R² Score: 0.349

✅ Training model for: Cardiovascular diseases
📉 RMSE: 263.302
📊 R² Score: 0.003

🔍 Forecast Model Performance Summary (using stable features):
╒═════════════════════════╤═════════╤═══════╤═════════════════════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│ Target                  │    RMSE │    R² │   Stable Features Count │ Stable Features                                                                                                                                                                                                                           │
╞═════════════════════════╪═════════╪═══════╪═════════════════════════╪═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ Life expectancy         │   3.268 │ 0.917 │                      15 │ Cost of a healthy diet, Income, Inflation, Child mortality rate, Sex ratio, GDP, CPI, Income_lag3, Inflation_lag1, Inflation_lag3, Child mortality rate_lag2, Child mortality rate_lag3, Unemployment Rate_lag2, Sex ratio_lag2, CPI_lag3 │
├─────────────────────────┼─────────┼───────┼─────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Diabetes                │   3.289 │ 0.349 │                      13 │ Cost of a healthy diet, Income, Inflation, GDP, CPI, Income_lag3, Inflation_lag1, Inflation_lag2, Inflation_lag3, Sex ratio_lag3, Median age_lag2, Median age_lag3, BMI_avg_lag3                                                          │
├─────────────────────────┼─────────┼───────┼─────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Cardiovascular diseases │ 263.302 │ 0.003 │                       4 │ Unemployment Rate, Sex ratio, Gini coefficient_lag3, BMI_avg_lag3                                                                                                                                                                         │
╘═════════════════════════╧═════════╧═══════╧═════════════════════════╧═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╛

⬇️ Downloading forecast_model_performance_summary_stable_features.csv

The model results show promising insights for two out of three health targets. For Life Expectancy, your retrained model performs exceptionally well — achieving an RMSE of 3.15 and an R² of 0.923, which means over 92% of the variation is successfully explained using the refined HAC-stable predictors. These include socioeconomic indicators such as Income, GDP, CPI, and crucial health-related factors like Child mortality rate and several significant lag features. For Diabetes, the model performs moderately well, with an RMSE of 3.50 and an R² of 0.261. While the predictive power is limited, it still identifies useful relationships between the target and predictors like BMI trends, Income, and Inflation. On the other hand, the model for Cardiovascular Diseases performs poorly, with a high RMSE of 268.93 and a negative R² of -0.04, meaning the model performs worse than simply predicting the average. This outcome strongly suggests that the available features do not explain cardiovascular outcomes effectively, and the model should be flagged as exploratory or omitted entirely from forecasting.

All three target variables—life expectancy, cardiovascular diseases, and diabetes—demonstrate clear signs of stationarity, as revealed by the Augmented Dickey-Fuller (ADF) test results. For life expectancy, the ADF statistic of –18.54 and an extremely small p-value indicate that the series is strongly stationary, with stable mean and variance over time. Similarly, cardiovascular diseases show an ADF statistic of -12.63 and a highly significant p-value, confirming stationarity in that time series as well. Diabetes follows the same trend, with an ADF statistic of –12.32 and a p-value well below conventional thresholds, again rejecting the null hypothesis of non-stationarity. In each case, the test statistic is more negative than the critical values at 1%, 5%, and 10% significance levels. This means you can confidently model these variables using standard regression techniques without having to transform them to achieve stationarity. It also reinforces that observed patterns are relatively stable across time, which supports both interpretation and forecasting with traditional linear models.

Time Series Forecasting with Walk-Forward Validation using ARIMA, Prophet, and Random Forests (RMSE Evaluation)¶

Rolling Forecast Validation (Walk-Forward)¶

Rolling or walk-forward forecast validation is a technique used to evaluate the performance of time series forecasting models in a way that closely resembles real-world forecasting scenarios. Its core purpose is to test how well a model predicts future values when only past information is available at each step. In this approach, the model is initially trained on historical data from 1950 to 2020, and then used to predict the next time step from 2021 to 2023. After this prediction, the actual observed value for 2021-2023 is added to the training set, and the model is retrained to predict 2024-2074. This process is repeated step-by-step, moving forward through time.

This method avoids data leakage by ensuring that the model is never trained on data from the future. It provides a realistic simulation of how forecasts are generated and evaluated in real-time decision-making. Additionally, it allows the model to adapt to potential non-stationarity in the data by retraining as new information becomes available. Overall, rolling forecast validation produces a more reliable estimate of model performance on unseen data, which is especially important in dynamic domains like health, economics, and climate modeling where past patterns may not hold indefinitely into the future.

10 diversity Countries have been selected by their income level for rolling forecast validation (Walk-Forward) as follows:

  • United States - High-income
  • Germany - High-income
  • Japan - High-income
  • Brazil - Upper-middle-income
  • India - Lower-middle-income
  • Indonesia - Lower-middle-income
  • Nigeria - Low-income
  • Kenya - Low-income
  • Mexico - Upper-middle-income
  • Bangladesh - Low-middle-income
In [ ]:
# Rolling Forecast - Walk Forward Validation

from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# === Setup ===
selected_countries = [
    'United States', 'Germany', 'Japan', 'Brazil', 'India',
    'Indonesia', 'Nigeria', 'Kenya', 'Mexico', 'Bangladesh'
]

target_columns = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

selected_features_dict = {
    'Life expectancy': [
        'Child mortality rate' , 'GDP' , 'CPI_lag3' , 'Incomplete tertiary education_lag3' , 'Income_lag3' , 'Income',
        'CPI' , 'Inflation', 'Inflation_lag1', 'Cost of a healthy diet', 'Cost of a healthy diet_lag3' , 'Unemployment Rate_lag2',
        'Gini coefficient_lag3', 'Unemployment Rate_lag1'
    ],
    'Cardiovascular diseases': [
        'BMI_avg_lag3'
    ],

    'Diabetes': [
        'BMI_avg_lag3', 'CPI' , 'GDP' , 'Income','Income_lag1', 'Inflation_lag1', 'Inflation' , 'Cost of a healthy diet' , 'Inflation_lag2' ,
        'Inflation_lag3'
    ]
}

start_train = 1950
end_train = 2020
real_eval_period = [2021, 2022, 2023]

# Forecast horizon starts from 2024 and goes till 2074
forecast_horizon = list(range(2024, 2074))

# === Create future rows for years 2024 to 2073
future_rows = []
for country in df_combined_with_country['Country'].unique():
    for year in forecast_horizon:
        future_rows.append({'Country': country, 'Year': year})

df_future = pd.DataFrame(future_rows)
df_forecast_ready = pd.concat([df_combined_with_country, df_future], ignore_index=True)
df_forecast_ready['Year'] = df_forecast_ready['Year'].astype(int)

# === Impute missing values across all countries and years
df_forecast_ready = (
    df_forecast_ready
    .sort_values(['Country', 'Year'])
    .groupby('Country', group_keys=False)
    .apply(lambda x: x.ffill().bfill().infer_objects(copy=False))
    .reset_index(drop=True)
)

# === Initialize summary table
predictions_summary = []

# === Forecast Loop ===
for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')

    for target in target_columns:
        print(f"\n {country} —  {target}")
        if target not in df_country.columns:
            print(" Target missing")
            continue

        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            print(" No usable features found")
            continue

        df_train = df_country[df_country['Year'].between(start_train, end_train)]
        df_eval_real = df_country[df_country['Year'].isin(real_eval_period)]

        # === ARIMA ===
        arima_rmse = None
        try:
            df_train_arima = df_train[[target]].copy()
            df_train_arima.index = pd.date_range(start=f'{start_train}', periods=len(df_train_arima), freq='YE')
            model_arima = ARIMA(df_train_arima, order=(1, 1, 1)).fit()

            # Real evaluation
            pred_real = model_arima.predict(start=len(df_train_arima), end=len(df_train_arima)+len(df_eval_real)-1)
            actual_real = df_eval_real[target].values
            arima_rmse = np.sqrt(mean_squared_error(actual_real, pred_real))

            # Forecast for 2024-2073
            arima_forecast = model_arima.predict(start=len(df_train_arima), end=len(df_train_arima) + len(forecast_horizon) - 1)
            print(f"📉 ARIMA RMSE: {arima_rmse:.2f}")
        except Exception as e:
            print(f" ARIMA error: {e}")

        # === Prophet ===
        prophet_rmse = None
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model_prophet = Prophet()
            model_prophet.fit(prophet_df)

            future_years = real_eval_period + forecast_horizon
            future_dates = pd.DataFrame({'ds': pd.to_datetime(future_years, format='%Y')})
            forecast_prophet = model_prophet.predict(future_dates)

            # Real evaluation
            pred_real = forecast_prophet[forecast_prophet['ds'].dt.year.isin(real_eval_period)]['yhat'].values
            actual_real = df_eval_real[target].values
            prophet_rmse = np.sqrt(mean_squared_error(actual_real, pred_real))

            # Forecast for 2024-2073
            prophet_forecast = forecast_prophet[forecast_prophet['ds'].dt.year.isin(forecast_horizon)]
            print(f" Prophet RMSE: {prophet_rmse:.2f}")
        except Exception as e:
            print(f" Prophet error: {e}")

      #### Random Forest ####
        #from google.colab import data_table
        #data_table.DataTable(df_forecast)

        rf_rmse, rf_forecast = None, [None] * len(df_future)
        try:
            X = df_country[available_features]
            y = df_country[target]
            X_train = X[df_country['Year'].between(start_train, end_train)]
            y_train = y[df_country['Year'].between(start_train, end_train)]
            X_eval = X[df_country['Year'].isin(real_eval_period)]
            y_eval = y[df_country['Year'].isin(real_eval_period)]
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            pred_eval = model.predict(X_eval)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, pred_eval))
            X_forecast = X[df_country['Year'].isin(forecast_horizon)]
            if not X_forecast.isnull().any(axis=1).any():
                rf_forecast = model.predict(X_forecast).tolist()
        except:
            pass


        # === Append to summary ===
        predictions_summary.append({
            "Country": country,
            "Target": target,
            "ARIMA_RMSE": round(arima_rmse, 4) if arima_rmse is not None else None,
            "Prophet_RMSE": round(prophet_rmse, 4) if prophet_rmse is not None else None,
            "RF_RMSE": round(rf_rmse, 4) if rf_rmse is not None else None
        })

# === Final Summary Table ===
df_forecast_validation_summary = pd.DataFrame(predictions_summary)
df_forecast_validation_summary = df_forecast_validation_summary[[
    "Country", "Target",
    "ARIMA_RMSE", "Prophet_RMSE", "RF_RMSE"
]]

print("\n 📋 Rolling Forecast Validation Summary:")
print(df_forecast_validation_summary)

# Export summary
df_forecast_validation_summary.to_csv("forecast_summary.csv", index=False)

# Download to your computer
from google.colab import files
files.download("forecast_summary.csv")
/tmp/ipython-input-37-3481924030.py:57: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  .apply(lambda x: x.ffill().bfill().infer_objects(copy=False))
/tmp/ipython-input-37-3481924030.py:57: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .apply(lambda x: x.ffill().bfill().infer_objects(copy=False))
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
 United States —  Life expectancy
📉 ARIMA RMSE: 2.00
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hv2ktus1.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/oe_r3aoy.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47361', 'data', 'file=/tmp/tmprjkocm4m/hv2ktus1.json', 'init=/tmp/tmprjkocm4m/oe_r3aoy.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3c9z48hr/prophet_model-20250723141100.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:02 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 1.56

 United States —  Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sa5_nkw_.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pn5w6eso.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25332', 'data', 'file=/tmp/tmprjkocm4m/sa5_nkw_.json', 'init=/tmp/tmprjkocm4m/pn5w6eso.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelyycjtjmt/prophet_model-20250723141104.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.19
14:11:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 11.97

 United States —  Diabetes
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sghhl48c.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ahdz28xn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97841', 'data', 'file=/tmp/tmprjkocm4m/sghhl48c.json', 'init=/tmp/tmprjkocm4m/ahdz28xn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modely7ekj70r/prophet_model-20250723141105.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.01
14:11:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.49

 Germany —  Life expectancy
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k2qniys2.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/atdkbrr0.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=21012', 'data', 'file=/tmp/tmprjkocm4m/k2qniys2.json', 'init=/tmp/tmprjkocm4m/atdkbrr0.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnw233o6r/prophet_model-20250723141106.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.47
14:11:07 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.61

 Germany —  Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ijt_1k_k.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xt8pr25r.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=48144', 'data', 'file=/tmp/tmprjkocm4m/ijt_1k_k.json', 'init=/tmp/tmprjkocm4m/xt8pr25r.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelog7no6i2/prophet_model-20250723141108.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:08 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.43
14:11:09 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 2.13

 Germany —  Diabetes
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j6behbrs.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/32yl0owc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97012', 'data', 'file=/tmp/tmprjkocm4m/j6behbrs.json', 'init=/tmp/tmprjkocm4m/32yl0owc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelq_r770jn/prophet_model-20250723141109.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:09 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.00
14:11:09 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 2.76

 Japan —  Life expectancy
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/df3mky2k.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/i0y34wue.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77900', 'data', 'file=/tmp/tmprjkocm4m/df3mky2k.json', 'init=/tmp/tmprjkocm4m/i0y34wue.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzl5v553j/prophet_model-20250723141110.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:10 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.64
14:11:10 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.58

 Japan —  Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/v4jzd52u.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/eys9n03c.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=52127', 'data', 'file=/tmp/tmprjkocm4m/v4jzd52u.json', 'init=/tmp/tmprjkocm4m/eys9n03c.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modela5jsybat/prophet_model-20250723141111.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.55
14:11:11 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
 Prophet RMSE: 7.69

 Japan —  Diabetes
📉 ARIMA RMSE: 0.00
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sqo53_sk.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/nw3ja3jo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=30846', 'data', 'file=/tmp/tmprjkocm4m/sqo53_sk.json', 'init=/tmp/tmprjkocm4m/nw3ja3jo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelao3gb3zi/prophet_model-20250723141111.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:12 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 1.84

 Brazil —  Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_mkeoghc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/5d68fu18.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47694', 'data', 'file=/tmp/tmprjkocm4m/_mkeoghc.json', 'init=/tmp/tmprjkocm4m/5d68fu18.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelemn_5pxf/prophet_model-20250723141112.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:12 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 3.01
14:11:12 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 2.19

 Brazil —  Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k4tsl6ub.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/brdxiy8q.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=12825', 'data', 'file=/tmp/tmprjkocm4m/k4tsl6ub.json', 'init=/tmp/tmprjkocm4m/brdxiy8q.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelb1p2eqd8/prophet_model-20250723141113.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:13 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.82
14:11:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 6.55

 Brazil —  Diabetes
📉 ARIMA RMSE: 0.00
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2tv9vzvy.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/34ls6e25.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=19841', 'data', 'file=/tmp/tmprjkocm4m/2tv9vzvy.json', 'init=/tmp/tmprjkocm4m/34ls6e25.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelic3ef0zy/prophet_model-20250723141113.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:13 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:14 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.19

 India —  Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/w3ahvdnj.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k_004rg9.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7967', 'data', 'file=/tmp/tmprjkocm4m/w3ahvdnj.json', 'init=/tmp/tmprjkocm4m/k_004rg9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpwl_gp6r/prophet_model-20250723141116.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:16 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.97
14:11:16 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 2.48

 India —  Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bdnme65z.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cn7m_qtq.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=72093', 'data', 'file=/tmp/tmprjkocm4m/bdnme65z.json', 'init=/tmp/tmprjkocm4m/cn7m_qtq.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model7dhx381j/prophet_model-20250723141117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 19.66
14:11:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 37.42

 India —  Diabetes
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ji15abxw.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qaexe7ak.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=42659', 'data', 'file=/tmp/tmprjkocm4m/ji15abxw.json', 'init=/tmp/tmprjkocm4m/qaexe7ak.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model7_9nqat3/prophet_model-20250723141117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.02
14:11:18 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.83

 Indonesia —  Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/or6ybfeu.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gjbudn2r.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=24173', 'data', 'file=/tmp/tmprjkocm4m/or6ybfeu.json', 'init=/tmp/tmprjkocm4m/gjbudn2r.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelb2l_7ydy/prophet_model-20250723141118.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:18 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.89
14:11:18 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 1.69

 Indonesia —  Cardiovascular diseases
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pkwwmjuc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/udz6n54u.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=61659', 'data', 'file=/tmp/tmprjkocm4m/pkwwmjuc.json', 'init=/tmp/tmprjkocm4m/udz6n54u.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model4k2hju0b/prophet_model-20250723141119.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:19 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 8.49
14:11:19 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
 Prophet RMSE: 8.00

 Indonesia —  Diabetes
📉 ARIMA RMSE: 0.00
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rjaf6qb0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sa1qdzyl.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=43599', 'data', 'file=/tmp/tmprjkocm4m/rjaf6qb0.json', 'init=/tmp/tmprjkocm4m/sa1qdzyl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model33g9hdrq/prophet_model-20250723141120.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:20 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.71

 Nigeria —  Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j5uixll_.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c2ncchjl.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=63492', 'data', 'file=/tmp/tmprjkocm4m/j5uixll_.json', 'init=/tmp/tmprjkocm4m/c2ncchjl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelbo2jk3p2/prophet_model-20250723141120.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:20 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.70
14:11:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.37

 Nigeria —  Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6signe76.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bbuk9v6d.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=50847', 'data', 'file=/tmp/tmprjkocm4m/6signe76.json', 'init=/tmp/tmprjkocm4m/bbuk9v6d.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2yahfc88/prophet_model-20250723141121.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:21 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.72
14:11:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 4.50

 Nigeria —  Diabetes
📉 ARIMA RMSE: 0.00
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sgchcqf2.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0dqqsdjd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22456', 'data', 'file=/tmp/tmprjkocm4m/sgchcqf2.json', 'init=/tmp/tmprjkocm4m/0dqqsdjd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model61v4p2b5/prophet_model-20250723141122.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:22 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:22 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.14

 Kenya —  Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c8_qu5z9.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o2y5cnz7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=70118', 'data', 'file=/tmp/tmprjkocm4m/c8_qu5z9.json', 'init=/tmp/tmprjkocm4m/o2y5cnz7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelc374tma2/prophet_model-20250723141122.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:22 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 3.24
14:11:23 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 1.67

 Kenya —  Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/m5eq4lm3.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cwqwo01b.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=3353', 'data', 'file=/tmp/tmprjkocm4m/m5eq4lm3.json', 'init=/tmp/tmprjkocm4m/cwqwo01b.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modely2k5w4ma/prophet_model-20250723141123.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:23 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.12
14:11:23 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.93

 Kenya —  Diabetes
📉 ARIMA RMSE: 0.00
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xc74yh1y.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rsc5b19s.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=14523', 'data', 'file=/tmp/tmprjkocm4m/xc74yh1y.json', 'init=/tmp/tmprjkocm4m/rsc5b19s.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhpvvgbar/prophet_model-20250723141124.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:24 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:24 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 3.48

 Mexico —  Life expectancy
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/otq4fust.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gvom3sq3.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=26553', 'data', 'file=/tmp/tmprjkocm4m/otq4fust.json', 'init=/tmp/tmprjkocm4m/gvom3sq3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeli2dck5t_/prophet_model-20250723141125.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:25 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 6.22
14:11:25 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 2.43

 Mexico —  Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yngac99c.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wkrpqp3_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17534', 'data', 'file=/tmp/tmprjkocm4m/yngac99c.json', 'init=/tmp/tmprjkocm4m/wkrpqp3_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelqk6ke56c/prophet_model-20250723141126.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:26 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 0.58
14:11:26 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.84

 Mexico —  Diabetes
📉 ARIMA RMSE: 0.00
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t5yrudgj.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/om9xdt73.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7342', 'data', 'file=/tmp/tmprjkocm4m/t5yrudgj.json', 'init=/tmp/tmprjkocm4m/om9xdt73.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelggoo_1se/prophet_model-20250723141126.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:26 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:26 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 0.80
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o7aet220.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zau5k5sq.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=92840', 'data', 'file=/tmp/tmprjkocm4m/o7aet220.json', 'init=/tmp/tmprjkocm4m/zau5k5sq.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeld16cfamt/prophet_model-20250723141127.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
 Bangladesh —  Life expectancy
📉 ARIMA RMSE: 2.31
14:11:27 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:27 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 1.68

 Bangladesh —  Cardiovascular diseases
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zocgl0ld.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jjkdu3gf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=30638', 'data', 'file=/tmp/tmprjkocm4m/zocgl0ld.json', 'init=/tmp/tmprjkocm4m/jjkdu3gf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelar84sbsb/prophet_model-20250723141128.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:28 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
📉 ARIMA RMSE: 1.18
14:11:28 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 6.99
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jop9n5f7.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rniq3m60.json
DEBUG:cmdstanpy:idx 0
 Bangladesh —  Diabetes
📉 ARIMA RMSE: 0.00
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17359', 'data', 'file=/tmp/tmprjkocm4m/jop9n5f7.json', 'init=/tmp/tmprjkocm4m/rniq3m60.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5zbgtsvv/prophet_model-20250723141129.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:11:29 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:11:30 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 Prophet RMSE: 2.99

 📋 Rolling Forecast Validation Summary:
          Country                   Target  ARIMA_RMSE  Prophet_RMSE  RF_RMSE
0   United States          Life expectancy      1.9969        1.5614   1.2177
1   United States  Cardiovascular diseases      1.1904       11.9749  10.0919
2   United States                 Diabetes      0.0080        0.4896   0.0040
3         Germany          Life expectancy      0.4746        0.6124   0.3367
4         Germany  Cardiovascular diseases      0.4339        2.1255   0.9503
5         Germany                 Diabetes      0.0000        2.7582   0.0000
6           Japan          Life expectancy      0.6387        0.5765   0.3200
7           Japan  Cardiovascular diseases      1.5477        7.6884   4.2376
8           Japan                 Diabetes      0.0000        1.8411   0.0162
9          Brazil          Life expectancy      3.0096        2.1896   1.2862
10         Brazil  Cardiovascular diseases      1.8195        6.5472   3.5130
11         Brazil                 Diabetes      0.0000        0.1860   0.0457
12          India          Life expectancy      1.9737        2.4758   2.1906
13          India  Cardiovascular diseases     19.6630       37.4210  47.5512
14          India                 Diabetes      0.0197        0.8306   0.0017
15      Indonesia          Life expectancy      1.8872        1.6929   1.6442
16      Indonesia  Cardiovascular diseases      8.4866        7.9981   0.0971
17      Indonesia                 Diabetes      0.0000        0.7121   0.0035
18        Nigeria          Life expectancy      0.7003        0.3693   1.2444
19        Nigeria  Cardiovascular diseases      0.7164        4.4984   3.6177
20        Nigeria                 Diabetes      0.0000        0.1408   0.0027
21          Kenya          Life expectancy      3.2353        1.6706   1.2934
22          Kenya  Cardiovascular diseases      0.1218        0.9335   0.7993
23          Kenya                 Diabetes      0.0004        3.4797   0.0052
24         Mexico          Life expectancy      6.2245        2.4286   2.4902
25         Mexico  Cardiovascular diseases      0.5788        0.8437   6.2764
26         Mexico                 Diabetes      0.0000        0.7997   0.4129
27     Bangladesh          Life expectancy      2.3127        1.6767   2.2987
28     Bangladesh  Cardiovascular diseases      1.1756        6.9912   4.9245
29     Bangladesh                 Diabetes      0.0000        2.9878   0.1017

Result of Rolling Forecast Validation Summary Table:¶

Life Expectancy Random Forest consistently performs best (lowest RMSE) in countries like the US (0.88), Germany (0.35), Japan (0.28), and Kenya (0.81).

Prophet also shows strong performance, especially in Nigeria (0.37), Japan (0.57), and Brazil (2.18), outperforming ARIMA in many cases.

ARIMA lags behind in several regions — e.g., Mexico (6.22), Kenya (3.23), and Brazil (3.00) — likely due to its assumption of linearity and stationarity.

Life expectancy benefits from tree-based models and components that capture nonlinearity, such as RF and Prophet.

Insight: Cardiovascular Diseases ARIMA generally performs well, especially in countries like Kenya (0.12), Mexico (0.58), Nigeria (0.71), and Germany (0.43).

Prophet struggles considerably in places like India (37.42), Indonesia (8.00), and Japan (7.69) — indicating this model may not handle sudden shifts or volatile patterns in cardiovascular outcomes.

RF offers competitive results, particularly in Bangladesh (0.41) and Germany (0.83).

Insight: ARIMA may capture slow-moving trends in cardiovascular diseases better than Prophet, while RF handles variation well in some countries.

Diabetes ARIMA dominates across almost all countries, delivering near-zero RMSE in Germany, Japan, Brazil, Bangladesh, and others — suggesting diabetes trends are very stable and predictable.

RF also performs well, though usually with slightly higher RMSE.

Prophet tends to underperform, with RMSE peaking in Bangladesh (2.99), Germany (2.75), and Kenya (3.48).

Insight: Diabetes trends appear highly stationary and stable, making them ideal for simpler time-series models like ARIMA.

In summary, Random Forest is the most reliable model for forecasting diverse health outcomes across countries. It handles complexity and variability well, making it particularly suitable for modeling cardiovascular diseases and life expectancy. The results reinforce the importance of selecting forecasting models based on both the nature of the health target and the data characteristics of each country.

Final Model Training & Forecasting | Evaluation metrics (RMSE, MAPE, R²)¶

Once the validation of the model's performance using walk-forward validation and selected the best-performing model(s), then proceed to train the final model on all available historical data (1950 - 2023). This step uses the full dataset to maximize the information available for learning patterns. The final trained model is then used to generate forecasts for the future from 2024 to 2073.

To evaluate model accuracy during the validation phase, common performance metrics such as RMSE, MAPE, and R² are calculated. These metrics help assess the model’s error magnitude, relative accuracy, and explanatory power, respectively, guiding the selection of the best-performing model for final deployment.

In [ ]:
# Step 19 Final Model Training & Forecasting - ok
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import warnings
import logging

warnings.filterwarnings("ignore")
logging.getLogger('statsmodels').setLevel(logging.ERROR)

# === Time Ranges
start_train = 1950
end_train = 2020
eval_years = [2021, 2022, 2023]
forecast_horizon = list(range(2024, 2075))

# === Input Variables
selected_countries = [
    'United States', 'Germany', 'Japan', 'Brazil', 'India',
    'Indonesia', 'Nigeria', 'Kenya', 'Mexico', 'Bangladesh'
]

target_columns = ['Life expectancy', 'Cardiovascular diseases', 'Diabetes']

selected_features_dict = {
    'Life expectancy': [
        'Child mortality rate' , 'GDP' , 'CPI_lag3' , 'Incomplete tertiary education_lag3' , 'Income_lag3' , 'Income',
        'CPI' , 'Inflation', 'Inflation_lag1', 'Cost of a healthy diet', 'Cost of a healthy diet_lag3' , 'Unemployment Rate_lag2',
        'Gini coefficient_lag3', 'Unemployment Rate_lag1'
    ],
    'Cardiovascular diseases': [
        'BMI_avg_lag3'
    ],

    'Diabetes': [
        'BMI_avg_lag3', 'CPI' , 'GDP' , 'Income','Income_lag1', 'Inflation_lag1', 'Inflation' , 'Cost of a healthy diet' , 'Inflation_lag2' ,
        'Inflation_lag3'
    ]
}



# === Ready Dataset (already loaded)
# df_forecast_ready = your real dataset

# === Forecasting and Evaluation
forecast_summary = []

for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')

    for target in target_columns:
        if target not in df_country.columns:
            continue

        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            continue

        df_train = df_country[df_country['Year'].between(start_train, end_train)]
        df_eval = df_country[df_country['Year'].isin(eval_years)]
        df_forecast = df_country[df_country['Year'].isin(forecast_horizon)]
        actual_eval = df_eval[target].values

        #### ARIMA ####
        arima_rmse, arima_forecast = None, [None] * len(df_forecast)
        try:
            train_series = df_train[[target]].copy()
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model = ARIMA(train_series, order=(1, 1, 1)).fit()
            pred_eval = model.predict(start=len(train_series), end=len(train_series) + len(df_eval) - 1)
            arima_rmse = np.sqrt(mean_squared_error(actual_eval, pred_eval))
            arima_forecast = model.predict(start=len(train_series) + len(df_eval),
                                           end=len(train_series) + len(df_eval) + len(df_forecast) - 1).tolist()
        except:
            pass

        #### Prophet ####
        prophet_rmse, prophet_forecast = None, [None] * len(df_forecast)
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model = Prophet()
            model.fit(prophet_df)
            eval_dates = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            forecast_eval = model.predict(eval_dates)
            prophet_rmse = np.sqrt(mean_squared_error(actual_eval, forecast_eval['yhat'].values))
            forecast_years = pd.DataFrame({'ds': pd.to_datetime(df_forecast['Year'], format='%Y')})
            prophet_forecast = model.predict(forecast_years)['yhat'].tolist()
        except:
            pass

        #### Random Forest ####
        rf_rmse, rf_forecast = None, [None] * len(df_forecast)
        try:
            X = df_country[available_features]
            y = df_country[target]
            X_train = X[df_country['Year'].between(start_train, end_train)]
            y_train = y[df_country['Year'].between(start_train, end_train)]
            X_eval = X[df_country['Year'].isin(eval_years)]
            y_eval = y[df_country['Year'].isin(eval_years)]
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            pred_eval = model.predict(X_eval)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, pred_eval))
            X_forecast = X[df_country['Year'].isin(forecast_horizon)]
            if not X_forecast.isnull().any(axis=1).any():
                rf_forecast = model.predict(X_forecast).tolist()
        except:
            pass

        for i, year in enumerate(df_forecast['Year']):
            forecast_summary.append({
                "Country": country,
                "Target": target,
                "Year": year,
                "ARIMA_RMSE": arima_rmse,
                "ARIMA_Forecast": arima_forecast[i],
                "Prophet_RMSE": prophet_rmse,
                "Prophet_Forecast": prophet_forecast[i],
                "RF_RMSE": rf_rmse,
                "RF_Forecast": rf_forecast[i]
            })

# === Combine All Results
df_model_comparison = pd.DataFrame(forecast_summary)

# === Summary Table: Best Model by RMSE
summary_table = df_model_comparison.groupby(['Country', 'Target'])[['ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE']].first().reset_index()

def best_model_picker(row):
    scores = {
        'ARIMA': row['ARIMA_RMSE'],
        'Prophet': row['Prophet_RMSE'],
        'RF': row['RF_RMSE']
    }
    return min(scores, key=lambda k: scores[k] if pd.notnull(scores[k]) else np.inf)

summary_table['🎯 Best_Model'] = summary_table.apply(best_model_picker, axis=1)

# === Display Results
print("\n📊 Summary of Best Models per Country and Target:\n")
print(summary_table[['Country', 'Target', 'ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE', '🎯 Best_Model']].to_string(index=False))

# === Optional Preview of Forecasts
sample_years = [2025, 2030, 2040, 2050, 2060, 2074]
df_sample = df_model_comparison[df_model_comparison['Year'].isin(sample_years)]
df_sample = df_sample.sort_values(['Country', 'Target', 'Year'])
print("\n📋 Forecasts for Selected Years:\n")
print(df_sample.head(30).to_string(index=False))

# Export summary
df_sample.to_csv("df_sample.csv", index=False)

# Download to your computer
from google.colab import files
files.download("df_sample.csv")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jhcwchc0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vweoogyl.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=24914', 'data', 'file=/tmp/tmprjkocm4m/jhcwchc0.json', 'init=/tmp/tmprjkocm4m/vweoogyl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelj0wnzvhz/prophet_model-20250723141643.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:43 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:44 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rre7spu8.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0mrqlqfe.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31573', 'data', 'file=/tmp/tmprjkocm4m/rre7spu8.json', 'init=/tmp/tmprjkocm4m/0mrqlqfe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelp876x4fq/prophet_model-20250723141644.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:44 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:45 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hoa_1hsa.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2njsc_od.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7159', 'data', 'file=/tmp/tmprjkocm4m/hoa_1hsa.json', 'init=/tmp/tmprjkocm4m/2njsc_od.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3nlagzka/prophet_model-20250723141646.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:46 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:46 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/l8ke2jrx.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lhjjw6g7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80603', 'data', 'file=/tmp/tmprjkocm4m/l8ke2jrx.json', 'init=/tmp/tmprjkocm4m/lhjjw6g7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model38jdxkhn/prophet_model-20250723141647.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:47 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:48 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gdxcl7wl.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3xke9evx.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87665', 'data', 'file=/tmp/tmprjkocm4m/gdxcl7wl.json', 'init=/tmp/tmprjkocm4m/3xke9evx.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpu7wv7cl/prophet_model-20250723141649.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:49 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:51 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6oeumoy8.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/htc62axo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=95466', 'data', 'file=/tmp/tmprjkocm4m/6oeumoy8.json', 'init=/tmp/tmprjkocm4m/htc62axo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model81f1yt28/prophet_model-20250723141652.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:52 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:53 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h09yeslt.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4qv94o17.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=5296', 'data', 'file=/tmp/tmprjkocm4m/h09yeslt.json', 'init=/tmp/tmprjkocm4m/4qv94o17.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeltfej5lav/prophet_model-20250723141654.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:54 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:54 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/iafuzlyb.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6ncgu1me.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=594', 'data', 'file=/tmp/tmprjkocm4m/iafuzlyb.json', 'init=/tmp/tmprjkocm4m/6ncgu1me.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5bun8m44/prophet_model-20250723141655.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:55 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:55 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/srnfxhb8.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/or_7mte5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=93099', 'data', 'file=/tmp/tmprjkocm4m/srnfxhb8.json', 'init=/tmp/tmprjkocm4m/or_7mte5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0njs55rf/prophet_model-20250723141656.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:56 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:57 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/7x29i9xw.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ab_73mx1.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=32920', 'data', 'file=/tmp/tmprjkocm4m/7x29i9xw.json', 'init=/tmp/tmprjkocm4m/ab_73mx1.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln5iq_o5t/prophet_model-20250723141657.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:57 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:16:58 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a1thbkwi.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1i9_4q6l.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=71652', 'data', 'file=/tmp/tmprjkocm4m/a1thbkwi.json', 'init=/tmp/tmprjkocm4m/1i9_4q6l.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model9rqekc0a/prophet_model-20250723141659.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:16:59 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:00 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q25m6dmr.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/x43il6jd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13303', 'data', 'file=/tmp/tmprjkocm4m/q25m6dmr.json', 'init=/tmp/tmprjkocm4m/x43il6jd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modellp41_i2v/prophet_model-20250723141700.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:01 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/7gvsqyum.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6_wbkee5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=66181', 'data', 'file=/tmp/tmprjkocm4m/7gvsqyum.json', 'init=/tmp/tmprjkocm4m/6_wbkee5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelu6qh116d/prophet_model-20250723141702.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:02 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:02 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xhvfnn5y.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lmtawef6.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=23981', 'data', 'file=/tmp/tmprjkocm4m/xhvfnn5y.json', 'init=/tmp/tmprjkocm4m/lmtawef6.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelujr2fgkc/prophet_model-20250723141704.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t6r67krm.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1mfv8zln.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7313', 'data', 'file=/tmp/tmprjkocm4m/t6r67krm.json', 'init=/tmp/tmprjkocm4m/1mfv8zln.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelwv5so_lt/prophet_model-20250723141705.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9kj3lavx.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ct0bz22w.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=68870', 'data', 'file=/tmp/tmprjkocm4m/9kj3lavx.json', 'init=/tmp/tmprjkocm4m/ct0bz22w.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnk_bh61a/prophet_model-20250723141706.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vxcvvjrc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/uflvq_h7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97440', 'data', 'file=/tmp/tmprjkocm4m/vxcvvjrc.json', 'init=/tmp/tmprjkocm4m/uflvq_h7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2dktv7ri/prophet_model-20250723141707.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:07 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:07 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wtggddnh.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3ug3tgbu.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=33870', 'data', 'file=/tmp/tmprjkocm4m/wtggddnh.json', 'init=/tmp/tmprjkocm4m/3ug3tgbu.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln00z30ma/prophet_model-20250723141708.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:08 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:08 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yal8p5kk.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/d9pb_k0z.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=74442', 'data', 'file=/tmp/tmprjkocm4m/yal8p5kk.json', 'init=/tmp/tmprjkocm4m/d9pb_k0z.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeld_04fv06/prophet_model-20250723141708.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:08 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:09 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/at3ncfq7.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bw701o_1.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87312', 'data', 'file=/tmp/tmprjkocm4m/at3ncfq7.json', 'init=/tmp/tmprjkocm4m/bw701o_1.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelvrp0ayey/prophet_model-20250723141709.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:09 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:09 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bk99waij.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9ugdadji.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77144', 'data', 'file=/tmp/tmprjkocm4m/bk99waij.json', 'init=/tmp/tmprjkocm4m/9ugdadji.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model6o9rlz6u/prophet_model-20250723141710.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:10 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:10 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o8r1wigd.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/28o8aerw.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18201', 'data', 'file=/tmp/tmprjkocm4m/o8r1wigd.json', 'init=/tmp/tmprjkocm4m/28o8aerw.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5gbxmkf0/prophet_model-20250723141711.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:11 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jqzok81d.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xql54bjo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=71865', 'data', 'file=/tmp/tmprjkocm4m/jqzok81d.json', 'init=/tmp/tmprjkocm4m/xql54bjo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelx0qhsg4m/prophet_model-20250723141711.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:12 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t8ktr0zd.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tdi6fb39.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=49385', 'data', 'file=/tmp/tmprjkocm4m/t8ktr0zd.json', 'init=/tmp/tmprjkocm4m/tdi6fb39.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelz0g653ov/prophet_model-20250723141712.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:12 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zrg5xabp.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_gjgf0g3.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1044', 'data', 'file=/tmp/tmprjkocm4m/zrg5xabp.json', 'init=/tmp/tmprjkocm4m/_gjgf0g3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelsgt2zo9_/prophet_model-20250723141713.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:13 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xakcg18v.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0ybp7s7o.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=55205', 'data', 'file=/tmp/tmprjkocm4m/xakcg18v.json', 'init=/tmp/tmprjkocm4m/0ybp7s7o.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelvo4r4_mp/prophet_model-20250723141714.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:14 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:14 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/69ylthlq.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yld0oep9.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=74260', 'data', 'file=/tmp/tmprjkocm4m/69ylthlq.json', 'init=/tmp/tmprjkocm4m/yld0oep9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelp73lir1i/prophet_model-20250723141714.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:14 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:15 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/awp7vhhy.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cll5xh9y.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=41605', 'data', 'file=/tmp/tmprjkocm4m/awp7vhhy.json', 'init=/tmp/tmprjkocm4m/cll5xh9y.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelm8jm_mj_/prophet_model-20250723141715.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:15 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:16 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rfipcmzh.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_qdzg3hl.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=62515', 'data', 'file=/tmp/tmprjkocm4m/rfipcmzh.json', 'init=/tmp/tmprjkocm4m/_qdzg3hl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model63lxhar3/prophet_model-20250723141717.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2wi25chy.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3t2y78lk.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=34308', 'data', 'file=/tmp/tmprjkocm4m/2wi25chy.json', 'init=/tmp/tmprjkocm4m/3t2y78lk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeleaesl2v1/prophet_model-20250723141718.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:17:18 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:17:18 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
📊 Summary of Best Models per Country and Target:

      Country                  Target  ARIMA_RMSE  Prophet_RMSE   RF_RMSE 🎯 Best_Model
   Bangladesh Cardiovascular diseases    1.175582      6.991238  4.924493        ARIMA
   Bangladesh                Diabetes    0.000036      2.987844  0.101733        ARIMA
   Bangladesh         Life expectancy    2.312728      1.676697  2.298684      Prophet
       Brazil Cardiovascular diseases    1.819507      6.547227  3.512954        ARIMA
       Brazil                Diabetes    0.000000      0.186005  0.045713        ARIMA
       Brazil         Life expectancy    3.009573      2.189554  1.286215           RF
      Germany Cardiovascular diseases    0.433925      2.125500  0.950348        ARIMA
      Germany                Diabetes    0.000000      2.758175  0.000000        ARIMA
      Germany         Life expectancy    0.474573      0.612408  0.336656           RF
        India Cardiovascular diseases   19.662985     37.420988 47.551155        ARIMA
        India                Diabetes    0.019744      0.830592  0.001732           RF
        India         Life expectancy    1.973657      2.475751  2.190597        ARIMA
    Indonesia Cardiovascular diseases    8.486563      7.998086  0.097082           RF
    Indonesia                Diabetes    0.000000      0.712114  0.003464        ARIMA
    Indonesia         Life expectancy    1.887179      1.692886  1.644150           RF
        Japan Cardiovascular diseases    1.547668      7.688441  4.237571        ARIMA
        Japan                Diabetes    0.000000      1.841061  0.016166        ARIMA
        Japan         Life expectancy    0.638746      0.576474  0.319972           RF
        Kenya Cardiovascular diseases    0.121752      0.933468  0.799274        ARIMA
        Kenya                Diabetes    0.000379      3.479734  0.005196        ARIMA
        Kenya         Life expectancy    3.235337      1.670562  1.293366           RF
       Mexico Cardiovascular diseases    0.578806      0.843693  6.276441        ARIMA
       Mexico                Diabetes    0.000000      0.799705  0.412910        ARIMA
       Mexico         Life expectancy    6.224500      2.428620  2.490150      Prophet
      Nigeria Cardiovascular diseases    0.716350      4.498448  3.617701        ARIMA
      Nigeria                Diabetes    0.000000      0.140798  0.002708        ARIMA
      Nigeria         Life expectancy    0.700330      0.369290  1.244393      Prophet
United States Cardiovascular diseases    1.190369     11.974926 10.091925        ARIMA
United States                Diabetes    0.007983      0.489566  0.004000           RF
United States         Life expectancy    1.996910      1.561422  1.217660           RF

📋 Forecasts for Selected Years:

   Country                  Target  Year  ARIMA_RMSE  ARIMA_Forecast  Prophet_RMSE  Prophet_Forecast  RF_RMSE  RF_Forecast
Bangladesh Cardiovascular diseases  2025    1.175582       30.440474      6.991238         22.343115 4.924493    23.340201
Bangladesh Cardiovascular diseases  2030    1.175582       31.940794      6.991238         24.463361 4.924493    23.340201
Bangladesh Cardiovascular diseases  2040    1.175582       34.260180      6.991238         28.785528 4.924493    23.340201
Bangladesh Cardiovascular diseases  2050    1.175582       35.897214      6.991238         32.425885 4.924493    23.340201
Bangladesh Cardiovascular diseases  2060    1.175582       37.052640      6.991238         36.748052 4.924493    23.340201
Bangladesh                Diabetes  2025    0.000036        9.800033      2.987844          6.593908 0.101733     9.643000
Bangladesh                Diabetes  2030    0.000036        9.800026      2.987844          6.364059 0.101733     9.643000
Bangladesh                Diabetes  2040    0.000036        9.800027      2.987844          5.788692 0.101733     9.643000
Bangladesh                Diabetes  2050    0.000036        9.800027      2.987844          5.211562 0.101733     9.643000
Bangladesh                Diabetes  2060    0.000036        9.800027      2.987844          4.636195 0.101733     9.643000
Bangladesh         Life expectancy  2025    2.312728       71.671340      1.676697         76.010260 2.298684    71.741197
Bangladesh         Life expectancy  2030    2.312728       71.671358      1.676697         77.911348 2.298684    71.741197
Bangladesh         Life expectancy  2040    2.312728       71.671358      1.676697         83.766696 2.298684    71.741197
Bangladesh         Life expectancy  2050    2.312728       71.671358      1.676697         88.736171 2.298684    71.741197
Bangladesh         Life expectancy  2060    2.312728       71.671358      1.676697         94.591519 2.298684    71.741197
    Brazil Cardiovascular diseases  2025    1.819507       37.512576      6.547227         34.048560 3.512954    35.433321
    Brazil Cardiovascular diseases  2030    1.819507       38.026433      6.547227         37.240321 3.512954    35.433321
    Brazil Cardiovascular diseases  2040    1.819507       38.724635      6.547227         43.801768 3.512954    35.433321
    Brazil Cardiovascular diseases  2050    1.819507       39.136841      6.547227         49.309747 3.512954    35.433321
    Brazil Cardiovascular diseases  2060    1.819507       39.380199      6.547227         55.871194 3.512954    35.433321
    Brazil                Diabetes  2025    0.000000        8.300000      0.186005          8.233562 0.045713     8.348000
    Brazil                Diabetes  2030    0.000000        8.300000      0.186005          8.506609 0.045713     8.348000
    Brazil                Diabetes  2040    0.000000        8.300000      0.186005          8.908965 0.045713     8.348000
    Brazil                Diabetes  2050    0.000000        8.300000      0.186005          9.428119 0.045713     8.348000
    Brazil                Diabetes  2060    0.000000        8.300000      0.186005          9.830475 0.045713     8.348000
    Brazil         Life expectancy  2025    3.009573       69.422619      2.189554         77.316414 1.286215    73.905454
    Brazil         Life expectancy  2030    3.009573       65.412065      2.189554         78.606403 1.286215    73.905454
    Brazil         Life expectancy  2040    3.009573       59.751236      2.189554         81.065716 1.286215    73.905454
    Brazil         Life expectancy  2050    3.009573       56.227383      2.189554         83.708557 1.286215    73.905454
    Brazil         Life expectancy  2060    3.009573       54.033793      2.189554         86.167871 1.286215    73.905454

Based on the analysis of the Rolling Forecast Validation Summary, the best forecasting model varies by target health outcome—life expectancy, cardiovascular diseases, and diabetes—depending on performance measured by RMSE (Root Mean Square Error). For life expectancy, the Random Forest (RF) model consistently demonstrated superior accuracy across most countries, including the United States, Germany, Japan, and Kenya, where it yielded the lowest RMSE values. This suggests that RF is particularly effective at capturing the complex, nonlinear relationships between life expectancy and its influencing factors, such as economic, demographic, and lifestyle variables.

In the case of cardiovascular diseases, the ARIMA model generally performed best, delivering the lowest RMSEs in countries like Germany, Brazil, Japan, and the United States. This indicates that ARIMA’s strength in modeling stable, time-dependent trends makes it suitable for forecasting cardiovascular disease rates in countries with relatively smooth temporal patterns. However, there are notable exceptions where RF outperformed ARIMA, especially in countries with more dynamic or nonlinear trends, such as India, Kenya, and Bangladesh. This highlights RF’s flexibility in handling complex or rapidly shifting patterns in disease rates.

For diabetes, the ARIMA model emerged as the most accurate and consistent forecasting approach across nearly all countries, often achieving near-zero RMSE. Countries such as Germany, Brazil, Japan, and Nigeria showed exceptionally low error rates using ARIMA, reinforcing its effectiveness in capturing the stable and gradual trends typically associated with diabetes prevalence over time. In contrast, Prophet and RF tended to produce higher errors for diabetes forecasts, making ARIMA the clear choice for this target.

In summary, the analysis suggests that Random Forest is the best model for life expectancy, ARIMA is optimal for diabetes, and cardiovascular diseases are best modeled with ARIMA generally, though RF is preferable in some specific countries with more complex patterns. This model selection strategy ensures more accurate and context-sensitive forecasting across different health outcomes and national settings.

Summary of Best Models per Country and Target¶

In [ ]:
# Summary of Best Models per Country and Target
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import warnings
import logging

warnings.filterwarnings("ignore")
logging.getLogger('statsmodels').setLevel(logging.ERROR)

# === Time Ranges
start_train = 1950
end_train = 2020
eval_years = [2021, 2022, 2023]
forecast_horizon = list(range(2024, 2075))

# === Input Variables
selected_features_dict = {
    'Life expectancy': [
        'Child mortality rate' , 'GDP' , 'CPI_lag3' , 'Incomplete tertiary education_lag3' , 'Income_lag3' , 'Income',
        'CPI' , 'Inflation', 'Inflation_lag1', 'Cost of a healthy diet', 'Cost of a healthy diet_lag3' , 'Unemployment Rate_lag2',
        'Gini coefficient_lag3', 'Unemployment Rate_lag1'
    ],
    'Cardiovascular diseases': [
        'BMI_avg_lag3'
    ],

    'Diabetes': [
        'BMI_avg_lag3', 'CPI' , 'GDP' , 'Income','Income_lag1', 'Inflation_lag1', 'Inflation' , 'Cost of a healthy diet' , 'Inflation_lag2' ,
        'Inflation_lag3'
    ]
}

# === Ready Dataset (already loaded)
# df_forecast_ready = your real dataset

# === Forecasting and Evaluation
forecast_summary = []

for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')

    for target in target_columns:
        if target not in df_country.columns:
            continue

        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            continue

        df_train = df_country[df_country['Year'].between(start_train, end_train)]
        df_eval = df_country[df_country['Year'].isin(eval_years)]
        df_forecast = df_country[df_country['Year'].isin(forecast_horizon)]
        actual_eval = df_eval[target].values

        #### ARIMA ####
        arima_rmse, arima_forecast = None, [None] * len(df_forecast)
        try:
            train_series = df_train[[target]].copy()
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model = ARIMA(train_series, order=(1, 1, 1)).fit()
            pred_eval = model.predict(start=len(train_series), end=len(train_series) + len(df_eval) - 1)
            arima_rmse = np.sqrt(mean_squared_error(actual_eval, pred_eval))
            arima_forecast = model.predict(start=len(train_series) + len(df_eval),
                                           end=len(train_series) + len(df_eval) + len(df_forecast) - 1).tolist()
        except:
            pass

        #### Prophet ####
        prophet_rmse, prophet_forecast = None, [None] * len(df_forecast)
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model = Prophet()
            model.fit(prophet_df)
            eval_dates = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            forecast_eval = model.predict(eval_dates)
            prophet_rmse = np.sqrt(mean_squared_error(actual_eval, forecast_eval['yhat'].values))
            forecast_years = pd.DataFrame({'ds': pd.to_datetime(df_forecast['Year'], format='%Y')})
            prophet_forecast = model.predict(forecast_years)['yhat'].tolist()
        except:
            pass

        #### Random Forest ####
        rf_rmse, rf_forecast = None, [None] * len(df_forecast)
        try:
            X = df_country[available_features]
            y = df_country[target]
            X_train = X[df_country['Year'].between(start_train, end_train)]
            y_train = y[df_country['Year'].between(start_train, end_train)]
            X_eval = X[df_country['Year'].isin(eval_years)]
            y_eval = y[df_country['Year'].isin(eval_years)]
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            pred_eval = model.predict(X_eval)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, pred_eval))
            X_forecast = X[df_country['Year'].isin(forecast_horizon)]
            if not X_forecast.isnull().any(axis=1).any():
                rf_forecast = model.predict(X_forecast).tolist()
        except:
            pass

        for i, year in enumerate(df_forecast['Year']):
            forecast_summary.append({
                "Country": country,
                "Target": target,
                "Year": year,
                "ARIMA_RMSE": arima_rmse,
                "ARIMA_Forecast": arima_forecast[i],
                "Prophet_RMSE": prophet_rmse,
                "Prophet_Forecast": prophet_forecast[i],
                "RF_RMSE": rf_rmse,
                "RF_Forecast": rf_forecast[i]
            })

# === Combine All Results
df_model_comparison = pd.DataFrame(forecast_summary)

# === Summary Table: Best Model by RMSE
summary_table = df_model_comparison.groupby(['Country', 'Target'])[['ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE']].first().reset_index()

def best_model_picker(row):
    scores = {
        'ARIMA': row['ARIMA_RMSE'],
        'Prophet': row['Prophet_RMSE'],
        'RF': row['RF_RMSE']
    }
    return min(scores, key=lambda k: scores[k] if pd.notnull(scores[k]) else np.inf)

summary_table['🎯 Best_Model'] = summary_table.apply(best_model_picker, axis=1)

# === Display Results
print("\n📊 Summary of Best Models per Country and Target:\n")
print(summary_table[['Country', 'Target', 'ARIMA_RMSE', 'Prophet_RMSE', 'RF_RMSE', '🎯 Best_Model']].to_string(index=False))

# === Optional Preview of Forecasts
sample_years = [2025, 2030, 2040, 2050, 2060, 2074]
df_sample = df_model_comparison[df_model_comparison['Year'].isin(sample_years)]
df_sample = df_sample.sort_values(['Country', 'Target', 'Year'])
print("\n📋 Forecasts for Selected Years:\n")
print(df_sample.head(30).to_string(index=False))

# Export summary
summary_table.to_csv("summary_table.csv", index=False)

# Download to your computer
from google.colab import files
files.download("summary_table.csv")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0uaygta5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/y_0bf22b.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79884', 'data', 'file=/tmp/tmprjkocm4m/0uaygta5.json', 'init=/tmp/tmprjkocm4m/y_0bf22b.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modele4l2i_hb/prophet_model-20250723142704.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wxtuj1yg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jercausa.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=63095', 'data', 'file=/tmp/tmprjkocm4m/wxtuj1yg.json', 'init=/tmp/tmprjkocm4m/jercausa.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8rpms0_i/prophet_model-20250723142705.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tyalwic6.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hpdvqxbc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=21626', 'data', 'file=/tmp/tmprjkocm4m/tyalwic6.json', 'init=/tmp/tmprjkocm4m/hpdvqxbc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3ajmvsiq/prophet_model-20250723142708.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:08 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:09 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c4giovs_.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o26sa9z_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31802', 'data', 'file=/tmp/tmprjkocm4m/c4giovs_.json', 'init=/tmp/tmprjkocm4m/o26sa9z_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelifo61iqm/prophet_model-20250723142710.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:10 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:10 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/batx6lqv.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/u80aprol.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87810', 'data', 'file=/tmp/tmprjkocm4m/batx6lqv.json', 'init=/tmp/tmprjkocm4m/u80aprol.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelex_ra9bb/prophet_model-20250723142711.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:12 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/geb2ga5h.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bt4alwzn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=8324', 'data', 'file=/tmp/tmprjkocm4m/geb2ga5h.json', 'init=/tmp/tmprjkocm4m/bt4alwzn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelg00w53q7/prophet_model-20250723142712.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:12 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9tf6yo1l.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fax3thm9.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80492', 'data', 'file=/tmp/tmprjkocm4m/9tf6yo1l.json', 'init=/tmp/tmprjkocm4m/fax3thm9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model189h6oxf/prophet_model-20250723142715.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:15 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:16 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hzffesbf.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/bjuloas1.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22647', 'data', 'file=/tmp/tmprjkocm4m/hzffesbf.json', 'init=/tmp/tmprjkocm4m/bjuloas1.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln8pwb9bg/prophet_model-20250723142718.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:18 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:19 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zc6sw2ry.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9hhkwcqg.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13649', 'data', 'file=/tmp/tmprjkocm4m/zc6sw2ry.json', 'init=/tmp/tmprjkocm4m/9hhkwcqg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model9lmy0wxx/prophet_model-20250723142719.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:19 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8ji8tg9w.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wen2r1fn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57148', 'data', 'file=/tmp/tmprjkocm4m/8ji8tg9w.json', 'init=/tmp/tmprjkocm4m/wen2r1fn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhxsjir7p/prophet_model-20250723142722.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:22 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:22 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/dbzp56ep.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/d5p02kbc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=87335', 'data', 'file=/tmp/tmprjkocm4m/dbzp56ep.json', 'init=/tmp/tmprjkocm4m/d5p02kbc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelprddtynf/prophet_model-20250723142723.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:23 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:23 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1ole7me2.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9yw8_zbw.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31899', 'data', 'file=/tmp/tmprjkocm4m/1ole7me2.json', 'init=/tmp/tmprjkocm4m/9yw8_zbw.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelt6rfkgyk/prophet_model-20250723142724.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:24 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:24 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6g0_e09q.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9q73uimo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=29568', 'data', 'file=/tmp/tmprjkocm4m/6g0_e09q.json', 'init=/tmp/tmprjkocm4m/9q73uimo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model62zbvolv/prophet_model-20250723142725.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:25 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:25 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pll52ekx.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o4uah_jb.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38568', 'data', 'file=/tmp/tmprjkocm4m/pll52ekx.json', 'init=/tmp/tmprjkocm4m/o4uah_jb.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelu2bceios/prophet_model-20250723142725.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:25 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:26 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/rrw4jsng.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/s6r9b384.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=99013', 'data', 'file=/tmp/tmprjkocm4m/rrw4jsng.json', 'init=/tmp/tmprjkocm4m/s6r9b384.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzggr85id/prophet_model-20250723142726.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:26 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:27 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3uj2mjnf.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3iufvc_v.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=55868', 'data', 'file=/tmp/tmprjkocm4m/3uj2mjnf.json', 'init=/tmp/tmprjkocm4m/3iufvc_v.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeljh75tgip/prophet_model-20250723142728.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:28 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:28 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/cc99mxtp.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/z38960xn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=56719', 'data', 'file=/tmp/tmprjkocm4m/cc99mxtp.json', 'init=/tmp/tmprjkocm4m/z38960xn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldkf4uq7l/prophet_model-20250723142729.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:29 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:29 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/n4waejiy.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ddkz4fw7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38710', 'data', 'file=/tmp/tmprjkocm4m/n4waejiy.json', 'init=/tmp/tmprjkocm4m/ddkz4fw7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelwik2h2k_/prophet_model-20250723142730.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:30 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:30 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yick6axh.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tpmo8mfn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=60372', 'data', 'file=/tmp/tmprjkocm4m/yick6axh.json', 'init=/tmp/tmprjkocm4m/tpmo8mfn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelrbqjn69i/prophet_model-20250723142730.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:30 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:31 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yy1ou4dl.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3ufp0eys.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=65453', 'data', 'file=/tmp/tmprjkocm4m/yy1ou4dl.json', 'init=/tmp/tmprjkocm4m/3ufp0eys.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldjftl38l/prophet_model-20250723142731.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:31 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:32 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/alvi6xs0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/l135u0am.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18390', 'data', 'file=/tmp/tmprjkocm4m/alvi6xs0.json', 'init=/tmp/tmprjkocm4m/l135u0am.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model7j6py8hp/prophet_model-20250723142732.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:32 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:32 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6rvt9dt5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/y7w05mpe.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6064', 'data', 'file=/tmp/tmprjkocm4m/6rvt9dt5.json', 'init=/tmp/tmprjkocm4m/y7w05mpe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzx285i_x/prophet_model-20250723142733.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:33 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:33 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6ws1o14o.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yj9xeaw5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57084', 'data', 'file=/tmp/tmprjkocm4m/6ws1o14o.json', 'init=/tmp/tmprjkocm4m/yj9xeaw5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeltfwe1azm/prophet_model-20250723142733.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:34 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:34 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0_ha6_es.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/80y4z0qs.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=82127', 'data', 'file=/tmp/tmprjkocm4m/0_ha6_es.json', 'init=/tmp/tmprjkocm4m/80y4z0qs.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhg28fgqr/prophet_model-20250723142734.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:34 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:35 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q7bqzno3.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/arav09wr.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=69149', 'data', 'file=/tmp/tmprjkocm4m/q7bqzno3.json', 'init=/tmp/tmprjkocm4m/arav09wr.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_0k_gwxi/prophet_model-20250723142735.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:35 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:36 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a_wxfuz1.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/x1_dz2um.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79623', 'data', 'file=/tmp/tmprjkocm4m/a_wxfuz1.json', 'init=/tmp/tmprjkocm4m/x1_dz2um.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3zsz2fsm/prophet_model-20250723142736.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:36 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:36 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2z3mu9uq.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ghs_5hi3.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80892', 'data', 'file=/tmp/tmprjkocm4m/2z3mu9uq.json', 'init=/tmp/tmprjkocm4m/ghs_5hi3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzmttl_kw/prophet_model-20250723142737.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:37 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:37 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0czxr6n3.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/_n6wsd0y.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6662', 'data', 'file=/tmp/tmprjkocm4m/0czxr6n3.json', 'init=/tmp/tmprjkocm4m/_n6wsd0y.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model25eiolvn/prophet_model-20250723142737.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:37 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:38 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/c9lghnac.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h21hf3u2.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75262', 'data', 'file=/tmp/tmprjkocm4m/c9lghnac.json', 'init=/tmp/tmprjkocm4m/h21hf3u2.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeleic9ik8d/prophet_model-20250723142738.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:38 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:38 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0xko49k2.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wbtnovqr.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1437', 'data', 'file=/tmp/tmprjkocm4m/0xko49k2.json', 'init=/tmp/tmprjkocm4m/wbtnovqr.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelv5a_av8n/prophet_model-20250723142739.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:39 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:39 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
📊 Summary of Best Models per Country and Target:

      Country                  Target  ARIMA_RMSE  Prophet_RMSE   RF_RMSE 🎯 Best_Model
   Bangladesh Cardiovascular diseases    1.175582      6.991238  4.924493        ARIMA
   Bangladesh                Diabetes    0.000036      2.987844  0.101733        ARIMA
   Bangladesh         Life expectancy    2.312728      1.676697  2.298684      Prophet
       Brazil Cardiovascular diseases    1.819507      6.547227  3.512954        ARIMA
       Brazil                Diabetes    0.000000      0.186005  0.045713        ARIMA
       Brazil         Life expectancy    3.009573      2.189554  1.286215           RF
      Germany Cardiovascular diseases    0.433925      2.125500  0.950348        ARIMA
      Germany                Diabetes    0.000000      2.758175  0.000000        ARIMA
      Germany         Life expectancy    0.474573      0.612408  0.336656           RF
        India Cardiovascular diseases   19.662985     37.420988 47.551155        ARIMA
        India                Diabetes    0.019744      0.830592  0.001732           RF
        India         Life expectancy    1.973657      2.475751  2.190597        ARIMA
    Indonesia Cardiovascular diseases    8.486563      7.998086  0.097082           RF
    Indonesia                Diabetes    0.000000      0.712114  0.003464        ARIMA
    Indonesia         Life expectancy    1.887179      1.692886  1.644150           RF
        Japan Cardiovascular diseases    1.547668      7.688441  4.237571        ARIMA
        Japan                Diabetes    0.000000      1.841061  0.016166        ARIMA
        Japan         Life expectancy    0.638746      0.576474  0.319972           RF
        Kenya Cardiovascular diseases    0.121752      0.933468  0.799274        ARIMA
        Kenya                Diabetes    0.000379      3.479734  0.005196        ARIMA
        Kenya         Life expectancy    3.235337      1.670562  1.293366           RF
       Mexico Cardiovascular diseases    0.578806      0.843693  6.276441        ARIMA
       Mexico                Diabetes    0.000000      0.799705  0.412910        ARIMA
       Mexico         Life expectancy    6.224500      2.428620  2.490150      Prophet
      Nigeria Cardiovascular diseases    0.716350      4.498448  3.617701        ARIMA
      Nigeria                Diabetes    0.000000      0.140798  0.002708        ARIMA
      Nigeria         Life expectancy    0.700330      0.369290  1.244393      Prophet
United States Cardiovascular diseases    1.190369     11.974926 10.091925        ARIMA
United States                Diabetes    0.007983      0.489566  0.004000           RF
United States         Life expectancy    1.996910      1.561422  1.217660           RF

📋 Forecasts for Selected Years:

   Country                  Target  Year  ARIMA_RMSE  ARIMA_Forecast  Prophet_RMSE  Prophet_Forecast  RF_RMSE  RF_Forecast
Bangladesh Cardiovascular diseases  2025    1.175582       30.440474      6.991238         22.343115 4.924493    23.340201
Bangladesh Cardiovascular diseases  2030    1.175582       31.940794      6.991238         24.463361 4.924493    23.340201
Bangladesh Cardiovascular diseases  2040    1.175582       34.260180      6.991238         28.785528 4.924493    23.340201
Bangladesh Cardiovascular diseases  2050    1.175582       35.897214      6.991238         32.425885 4.924493    23.340201
Bangladesh Cardiovascular diseases  2060    1.175582       37.052640      6.991238         36.748052 4.924493    23.340201
Bangladesh                Diabetes  2025    0.000036        9.800033      2.987844          6.593908 0.101733     9.643000
Bangladesh                Diabetes  2030    0.000036        9.800026      2.987844          6.364059 0.101733     9.643000
Bangladesh                Diabetes  2040    0.000036        9.800027      2.987844          5.788692 0.101733     9.643000
Bangladesh                Diabetes  2050    0.000036        9.800027      2.987844          5.211562 0.101733     9.643000
Bangladesh                Diabetes  2060    0.000036        9.800027      2.987844          4.636195 0.101733     9.643000
Bangladesh         Life expectancy  2025    2.312728       71.671340      1.676697         76.010260 2.298684    71.741197
Bangladesh         Life expectancy  2030    2.312728       71.671358      1.676697         77.911348 2.298684    71.741197
Bangladesh         Life expectancy  2040    2.312728       71.671358      1.676697         83.766696 2.298684    71.741197
Bangladesh         Life expectancy  2050    2.312728       71.671358      1.676697         88.736171 2.298684    71.741197
Bangladesh         Life expectancy  2060    2.312728       71.671358      1.676697         94.591519 2.298684    71.741197
    Brazil Cardiovascular diseases  2025    1.819507       37.512576      6.547227         34.048560 3.512954    35.433321
    Brazil Cardiovascular diseases  2030    1.819507       38.026433      6.547227         37.240321 3.512954    35.433321
    Brazil Cardiovascular diseases  2040    1.819507       38.724635      6.547227         43.801768 3.512954    35.433321
    Brazil Cardiovascular diseases  2050    1.819507       39.136841      6.547227         49.309747 3.512954    35.433321
    Brazil Cardiovascular diseases  2060    1.819507       39.380199      6.547227         55.871194 3.512954    35.433321
    Brazil                Diabetes  2025    0.000000        8.300000      0.186005          8.233562 0.045713     8.348000
    Brazil                Diabetes  2030    0.000000        8.300000      0.186005          8.506609 0.045713     8.348000
    Brazil                Diabetes  2040    0.000000        8.300000      0.186005          8.908965 0.045713     8.348000
    Brazil                Diabetes  2050    0.000000        8.300000      0.186005          9.428119 0.045713     8.348000
    Brazil                Diabetes  2060    0.000000        8.300000      0.186005          9.830475 0.045713     8.348000
    Brazil         Life expectancy  2025    3.009573       69.422619      2.189554         77.316414 1.286215    73.905454
    Brazil         Life expectancy  2030    3.009573       65.412065      2.189554         78.606403 1.286215    73.905454
    Brazil         Life expectancy  2040    3.009573       59.751236      2.189554         81.065716 1.286215    73.905454
    Brazil         Life expectancy  2050    3.009573       56.227383      2.189554         83.708557 1.286215    73.905454
    Brazil         Life expectancy  2060    3.009573       54.033793      2.189554         86.167871 1.286215    73.905454

Evaluation metrics (RMSE, MAPE, R²)¶

In [ ]:
# Evaluation metrics (RMSE, MAPE, R²)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def calculate_metrics(actual, predicted):
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return round(rmse, 4), round(mape, 2), round(r2, 4)

metrics_summary = []
eval_results = []  # Add this above your for-country loop to initialize the collector

# Evaluation years
eval_years = [2021, 2022, 2023]

for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country]

    for target in target_columns:
        if target not in df_country.columns:
            continue

        actual = df_country[df_country['Year'].isin(eval_years)][target].values

        # --- ARIMA ---
        try:
            train_series = df_country[df_country['Year'].between(1950, 2020)][[target]]
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model_arima = ARIMA(train_series, order=(1, 1, 1)).fit()
            arima_pred = model_arima.predict(start=len(train_series), end=len(train_series)+len(eval_years)-1)
            arima_rmse, arima_mape, arima_r2 = calculate_metrics(actual, arima_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "ARIMA",
                "RMSE": arima_rmse, "MAPE": arima_mape, "R²": arima_r2
            })
        except:
            pass

        # --- Prophet ---
        try:
            prophet_df = df_country[df_country['Year'].between(1950, 2020)][['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model_prophet = Prophet()
            model_prophet.fit(prophet_df)
            future_eval = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            prophet_pred = model_prophet.predict(future_eval)['yhat'].values
            prophet_rmse, prophet_mape, prophet_r2 = calculate_metrics(actual, prophet_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Prophet",
                "RMSE": prophet_rmse, "MAPE": prophet_mape, "R²": prophet_r2
            })
        except:
            pass

        # --- Random Forest ---
        try:
            features = selected_features_dict.get(target, [])
            available = [f for f in features if f in df_country.columns]
            X = df_country[available]
            y = df_country[target]
            X_train = X[df_country['Year'].between(1950, 2020)]
            y_train = y[df_country['Year'].between(1950, 2020)]
            X_eval = X[df_country['Year'].isin(eval_years)]
            model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
            model_rf.fit(X_train, y_train)
            rf_pred = model_rf.predict(X_eval)
            rf_rmse, rf_mape, rf_r2 = calculate_metrics(actual, rf_pred)

        # ✅ Add this block
            eval_rows = pd.DataFrame({
               "Country": [country] * len(eval_years),
               "Target": [target] * len(eval_years),
               "Year": eval_years,
               "Prediction": rf_pred,
               "Actual": actual
            })
            eval_results.append(eval_rows)

            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Random Forest",
                "RMSE": rf_rmse, "MAPE": rf_mape, "R²": rf_r2
            })
        except:
            pass
df_eval_pred = pd.concat(eval_results, ignore_index=True)

def pick_best_model(group):
    return group.loc[group['RMSE'].idxmin(), 'Model']

# Convert to DataFrame
df_metrics = pd.DataFrame(metrics_summary)

# Sort it and assign it to df_metrics_sorted
df_metrics_sorted = df_metrics.sort_values(['Country', 'Target', 'Model']).reset_index(drop=True)

# Best model picker function
def pick_best_model(group):
    return group.loc[group['RMSE'].idxmin(), 'Model']

# Assign Best_Model using groupby and transform
df_metrics_sorted['Best_Model'] = df_metrics_sorted.groupby(['Country', 'Target'])['RMSE'].transform(
    lambda x: df_metrics_sorted.loc[x.idxmin(), 'Model']
)

# Display full table
print("\n🎯 Step 20: Evaluation Summary with Best Model\n")
print(df_metrics_sorted[['Country', 'Target', 'Model', 'RMSE', 'MAPE', 'R²', 'Best_Model']].to_string(index=False))

# Export summary
df_metrics_sorted.to_csv("df_metrics_sorted.csv", index=False)

# Download to your computer
from google.colab import files
files.download("df_metrics_sorted.csv")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/eqjjlcvx.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/z2w7pcm_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=59368', 'data', 'file=/tmp/tmprjkocm4m/eqjjlcvx.json', 'init=/tmp/tmprjkocm4m/z2w7pcm_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelxyjgjdq5/prophet_model-20250723142739.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:39 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:40 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vg5i6vka.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lmka780h.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75109', 'data', 'file=/tmp/tmprjkocm4m/vg5i6vka.json', 'init=/tmp/tmprjkocm4m/lmka780h.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelh7h91zgo/prophet_model-20250723142741.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:41 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:41 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gmsvuhmu.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0y0ma29h.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=19632', 'data', 'file=/tmp/tmprjkocm4m/gmsvuhmu.json', 'init=/tmp/tmprjkocm4m/0y0ma29h.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelkw7mltvl/prophet_model-20250723142742.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:42 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:43 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/smiy06t0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/riq4u0nn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=36038', 'data', 'file=/tmp/tmprjkocm4m/smiy06t0.json', 'init=/tmp/tmprjkocm4m/riq4u0nn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelczq1p405/prophet_model-20250723142743.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:43 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:44 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t008m8yn.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6dwjfhiq.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=99765', 'data', 'file=/tmp/tmprjkocm4m/t008m8yn.json', 'init=/tmp/tmprjkocm4m/6dwjfhiq.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelr_kg546m/prophet_model-20250723142744.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:44 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:45 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/oadqeb0v.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pk4m4u71.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=12168', 'data', 'file=/tmp/tmprjkocm4m/oadqeb0v.json', 'init=/tmp/tmprjkocm4m/pk4m4u71.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelq7u60i3_/prophet_model-20250723142746.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:46 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:46 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h40__qsi.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/n32vskud.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57212', 'data', 'file=/tmp/tmprjkocm4m/h40__qsi.json', 'init=/tmp/tmprjkocm4m/n32vskud.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelz3ogozpi/prophet_model-20250723142747.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:47 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:48 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gscmz__t.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ik9qkw_0.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25384', 'data', 'file=/tmp/tmprjkocm4m/gscmz__t.json', 'init=/tmp/tmprjkocm4m/ik9qkw_0.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldacp3582/prophet_model-20250723142749.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:49 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:49 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/7hpoi5td.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zn3uklyn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=93924', 'data', 'file=/tmp/tmprjkocm4m/7hpoi5td.json', 'init=/tmp/tmprjkocm4m/zn3uklyn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0j5cee4e/prophet_model-20250723142749.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:50 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:50 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8psco2ov.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tkhr35wn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=84665', 'data', 'file=/tmp/tmprjkocm4m/8psco2ov.json', 'init=/tmp/tmprjkocm4m/tkhr35wn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modell9z44mqp/prophet_model-20250723142750.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:50 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:51 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/spgmtiay.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/io0cpspk.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=3082', 'data', 'file=/tmp/tmprjkocm4m/spgmtiay.json', 'init=/tmp/tmprjkocm4m/io0cpspk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelvtbox0en/prophet_model-20250723142751.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:51 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:51 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/eo_te0ht.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vc7l2mzf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=60614', 'data', 'file=/tmp/tmprjkocm4m/eo_te0ht.json', 'init=/tmp/tmprjkocm4m/vc7l2mzf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelchrkwy9s/prophet_model-20250723142752.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:52 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:52 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pnk604ek.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/g732zgm_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=35287', 'data', 'file=/tmp/tmprjkocm4m/pnk604ek.json', 'init=/tmp/tmprjkocm4m/g732zgm_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeli6h6u0iz/prophet_model-20250723142752.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:52 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:53 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/b7vblsr6.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fr2_eiu4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=35398', 'data', 'file=/tmp/tmprjkocm4m/b7vblsr6.json', 'init=/tmp/tmprjkocm4m/fr2_eiu4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelmhphwbs7/prophet_model-20250723142755.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:55 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:55 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/r9g89p_f.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ijnrdngf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=54075', 'data', 'file=/tmp/tmprjkocm4m/r9g89p_f.json', 'init=/tmp/tmprjkocm4m/ijnrdngf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnxglfmc3/prophet_model-20250723142756.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:56 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:56 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ygbj5tne.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/l9fgdivj.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=54945', 'data', 'file=/tmp/tmprjkocm4m/ygbj5tne.json', 'init=/tmp/tmprjkocm4m/l9fgdivj.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelrv7d2fb0/prophet_model-20250723142756.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:56 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:57 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9460n9e5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/08bkdmuo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1255', 'data', 'file=/tmp/tmprjkocm4m/9460n9e5.json', 'init=/tmp/tmprjkocm4m/08bkdmuo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_8o8l24m/prophet_model-20250723142757.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:57 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:57 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8pdq6ykg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/28lfdde0.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=1282', 'data', 'file=/tmp/tmprjkocm4m/8pdq6ykg.json', 'init=/tmp/tmprjkocm4m/28lfdde0.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhlgecqdp/prophet_model-20250723142758.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:58 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:58 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tdpf1_54.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0dwo8sfe.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38022', 'data', 'file=/tmp/tmprjkocm4m/tdpf1_54.json', 'init=/tmp/tmprjkocm4m/0dwo8sfe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzxjdv7s4/prophet_model-20250723142758.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:58 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:59 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1p2t8ykc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vrwn2mde.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=28166', 'data', 'file=/tmp/tmprjkocm4m/1p2t8ykc.json', 'init=/tmp/tmprjkocm4m/vrwn2mde.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeluzp444cl/prophet_model-20250723142759.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:27:59 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:27:59 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/njpwbnzq.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/kt8aa3fc.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=58212', 'data', 'file=/tmp/tmprjkocm4m/njpwbnzq.json', 'init=/tmp/tmprjkocm4m/kt8aa3fc.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model92bmcut4/prophet_model-20250723142800.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:00 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6_vkip_j.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/o8grblo9.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=4955', 'data', 'file=/tmp/tmprjkocm4m/6_vkip_j.json', 'init=/tmp/tmprjkocm4m/o8grblo9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0i3bmw6j/prophet_model-20250723142800.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:00 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:01 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lhv_eq_n.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/w9tg63fy.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13290', 'data', 'file=/tmp/tmprjkocm4m/lhv_eq_n.json', 'init=/tmp/tmprjkocm4m/w9tg63fy.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model1kzgl0y3/prophet_model-20250723142801.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:01 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:01 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/isyciumg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e5lx5x4_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7456', 'data', 'file=/tmp/tmprjkocm4m/isyciumg.json', 'init=/tmp/tmprjkocm4m/e5lx5x4_.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeliabjxlt5/prophet_model-20250723142802.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:02 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:02 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gfse9eex.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/p41bc0hj.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18278', 'data', 'file=/tmp/tmprjkocm4m/gfse9eex.json', 'init=/tmp/tmprjkocm4m/p41bc0hj.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelasbnq6yo/prophet_model-20250723142802.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:02 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:03 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vxqrg_hg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8m_ipt_q.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=62453', 'data', 'file=/tmp/tmprjkocm4m/vxqrg_hg.json', 'init=/tmp/tmprjkocm4m/8m_ipt_q.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelentrb2vk/prophet_model-20250723142803.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:03 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/twg124ht.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/w5quimbo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=76939', 'data', 'file=/tmp/tmprjkocm4m/twg124ht.json', 'init=/tmp/tmprjkocm4m/w5quimbo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeljgyb62fy/prophet_model-20250723142804.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:04 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t62344g0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tisx2xkx.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79164', 'data', 'file=/tmp/tmprjkocm4m/t62344g0.json', 'init=/tmp/tmprjkocm4m/tisx2xkx.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model39mmf_st/prophet_model-20250723142804.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:04 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/b1h9unmc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hgcljw8a.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47767', 'data', 'file=/tmp/tmprjkocm4m/b1h9unmc.json', 'init=/tmp/tmprjkocm4m/hgcljw8a.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelfey3cwl9/prophet_model-20250723142805.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:05 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/adkcdrh2.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/evt78x3n.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=17956', 'data', 'file=/tmp/tmprjkocm4m/adkcdrh2.json', 'init=/tmp/tmprjkocm4m/evt78x3n.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpg39k48t/prophet_model-20250723142806.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:06 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
🎯 Step 20: Evaluation Summary with Best Model

      Country                  Target         Model    RMSE  MAPE            R²    Best_Model
   Bangladesh Cardiovascular diseases         ARIMA  1.1756  4.02 -1.094927e+29         ARIMA
   Bangladesh Cardiovascular diseases       Prophet  6.9912 24.69 -3.872468e+30         ARIMA
   Bangladesh Cardiovascular diseases Random Forest  4.9245 17.42 -1.921333e+30         ARIMA
   Bangladesh                Diabetes         ARIMA  0.0000  0.00  0.000000e+00         ARIMA
   Bangladesh                Diabetes       Prophet  2.9878 30.49  0.000000e+00         ARIMA
   Bangladesh                Diabetes Random Forest  0.1017  0.81  0.000000e+00         ARIMA
   Bangladesh         Life expectancy         ARIMA  2.3127  2.76 -1.102500e+00       Prophet
   Bangladesh         Life expectancy       Prophet  1.6767  1.89 -1.051000e-01       Prophet
   Bangladesh         Life expectancy Random Forest  2.2987  2.94 -1.077100e+00       Prophet
       Brazil Cardiovascular diseases         ARIMA  1.8195  4.66  0.000000e+00         ARIMA
       Brazil Cardiovascular diseases       Prophet  6.5472 16.73  0.000000e+00         ARIMA
       Brazil Cardiovascular diseases Random Forest  3.5130  9.02  0.000000e+00         ARIMA
       Brazil                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
       Brazil                Diabetes       Prophet  0.1860  2.14  0.000000e+00         ARIMA
       Brazil                Diabetes Random Forest  0.0457  0.55  0.000000e+00         ARIMA
       Brazil         Life expectancy         ARIMA  3.0096  3.29 -5.672800e+00 Random Forest
       Brazil         Life expectancy       Prophet  2.1896  2.66 -2.531900e+00 Random Forest
       Brazil         Life expectancy Random Forest  1.2862  1.52 -2.188000e-01 Random Forest
      Germany Cardiovascular diseases         ARIMA  0.4339  1.23  0.000000e+00         ARIMA
      Germany Cardiovascular diseases       Prophet  2.1255  5.82  0.000000e+00         ARIMA
      Germany Cardiovascular diseases Random Forest  0.9503  2.69  0.000000e+00         ARIMA
      Germany                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
      Germany                Diabetes       Prophet  2.7582 55.13  0.000000e+00         ARIMA
      Germany                Diabetes Random Forest  0.0000  0.00  1.000000e+00         ARIMA
      Germany         Life expectancy         ARIMA  0.4746  0.44 -1.051900e+00 Random Forest
      Germany         Life expectancy       Prophet  0.6124  0.65 -2.417000e+00 Random Forest
      Germany         Life expectancy Random Forest  0.3367  0.38 -3.260000e-02 Random Forest
        India Cardiovascular diseases         ARIMA 19.6630  6.68  0.000000e+00         ARIMA
        India Cardiovascular diseases       Prophet 37.4210 12.75  0.000000e+00         ARIMA
        India Cardiovascular diseases Random Forest 47.5512 16.61  0.000000e+00         ARIMA
        India                Diabetes         ARIMA  0.0197  0.21  0.000000e+00 Random Forest
        India                Diabetes       Prophet  0.8306  9.49  0.000000e+00 Random Forest
        India                Diabetes Random Forest  0.0017  0.01  0.000000e+00 Random Forest
        India         Life expectancy         ARIMA  1.9737  2.25  1.628000e-01         ARIMA
        India         Life expectancy       Prophet  2.4758  2.42 -3.173000e-01         ARIMA
        India         Life expectancy Random Forest  2.1906  2.96 -3.130000e-02         ARIMA
    Indonesia Cardiovascular diseases         ARIMA  8.4866 11.75  0.000000e+00 Random Forest
    Indonesia Cardiovascular diseases       Prophet  7.9981  9.90  0.000000e+00 Random Forest
    Indonesia Cardiovascular diseases Random Forest  0.0971  0.13  0.000000e+00 Random Forest
    Indonesia                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
    Indonesia                Diabetes       Prophet  0.7121  9.24  0.000000e+00         ARIMA
    Indonesia                Diabetes Random Forest  0.0035  0.03  0.000000e+00         ARIMA
    Indonesia         Life expectancy         ARIMA  1.8872  2.68 -2.444000e-01 Random Forest
    Indonesia         Life expectancy       Prophet  1.6929  1.48 -1.400000e-03 Random Forest
    Indonesia         Life expectancy Random Forest  1.6442  2.28  5.540000e-02 Random Forest
        Japan Cardiovascular diseases         ARIMA  1.5477  3.73  0.000000e+00         ARIMA
        Japan Cardiovascular diseases       Prophet  7.6884 18.56  0.000000e+00         ARIMA
        Japan Cardiovascular diseases Random Forest  4.2376 10.27  0.000000e+00         ARIMA
        Japan                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
        Japan                Diabetes       Prophet  1.8411 27.47  0.000000e+00         ARIMA
        Japan                Diabetes Random Forest  0.0162  0.14  0.000000e+00         ARIMA
        Japan         Life expectancy         ARIMA  0.6387  0.68 -4.204900e+00 Random Forest
        Japan         Life expectancy       Prophet  0.5765  0.59 -3.239500e+00 Random Forest
        Japan         Life expectancy Random Forest  0.3200  0.37 -3.061000e-01 Random Forest
        Kenya Cardiovascular diseases         ARIMA  0.1218  3.48 -7.516462e+28         ARIMA
        Kenya Cardiovascular diseases       Prophet  0.9335 26.66 -4.418335e+30         ARIMA
        Kenya Cardiovascular diseases Random Forest  0.7993 22.85 -3.239297e+30         ARIMA
        Kenya                Diabetes         ARIMA  0.0004  0.01  0.000000e+00         ARIMA
        Kenya                Diabetes       Prophet  3.4797 57.98  0.000000e+00         ARIMA
        Kenya                Diabetes Random Forest  0.0052  0.08  0.000000e+00         ARIMA
        Kenya         Life expectancy         ARIMA  3.2353  4.35 -7.360000e+00 Random Forest
        Kenya         Life expectancy       Prophet  1.6706  2.25 -1.228900e+00 Random Forest
        Kenya         Life expectancy Random Forest  1.2934  1.95 -3.360000e-01 Random Forest
       Mexico Cardiovascular diseases         ARIMA  0.5788  2.17 -2.654270e+28         ARIMA
       Mexico Cardiovascular diseases       Prophet  0.8437  3.08 -5.639601e+28         ARIMA
       Mexico Cardiovascular diseases Random Forest  6.2764 28.37 -3.121092e+30         ARIMA
       Mexico                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
       Mexico                Diabetes       Prophet  0.7997  7.13 -2.026747e+29         ARIMA
       Mexico                Diabetes Random Forest  0.4129  3.01 -5.403202e+28         ARIMA
       Mexico         Life expectancy         ARIMA  6.2245  7.05 -6.367500e+00       Prophet
       Mexico         Life expectancy       Prophet  2.4286  2.54 -1.216000e-01       Prophet
       Mexico         Life expectancy Random Forest  2.4902  3.34 -1.791000e-01       Prophet
      Nigeria Cardiovascular diseases         ARIMA  0.7164  3.98  0.000000e+00         ARIMA
      Nigeria Cardiovascular diseases       Prophet  4.4984 24.97  0.000000e+00         ARIMA
      Nigeria Cardiovascular diseases Random Forest  3.6177 20.11  0.000000e+00         ARIMA
      Nigeria                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
      Nigeria                Diabetes       Prophet  0.1408  2.06  0.000000e+00         ARIMA
      Nigeria                Diabetes Random Forest  0.0027  0.04  0.000000e+00         ARIMA
      Nigeria         Life expectancy         ARIMA  0.7003  1.17 -1.846100e+00       Prophet
      Nigeria         Life expectancy       Prophet  0.3693  0.58  2.086000e-01       Prophet
      Nigeria         Life expectancy Random Forest  1.2444  2.14 -7.985900e+00       Prophet
United States Cardiovascular diseases         ARIMA  1.1904  1.29  0.000000e+00         ARIMA
United States Cardiovascular diseases       Prophet 11.9749 12.93  0.000000e+00         ARIMA
United States Cardiovascular diseases Random Forest 10.0919 10.96  0.000000e+00         ARIMA
United States                Diabetes         ARIMA  0.0080  0.10  0.000000e+00 Random Forest
United States                Diabetes       Prophet  0.4896  6.65  0.000000e+00 Random Forest
United States                Diabetes Random Forest  0.0040  0.05  0.000000e+00 Random Forest
United States         Life expectancy         ARIMA  1.9969  2.10 -1.796800e+00 Random Forest
United States         Life expectancy       Prophet  1.5614  1.63 -7.100000e-01 Random Forest
United States         Life expectancy Random Forest  1.2177  1.39 -3.990000e-02 Random Forest
In [ ]:
# Evaluation metrics (RMSE, MAPE, R²)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def calculate_metrics(actual, predicted):
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return round(rmse, 4), round(mape, 2), round(r2, 4)

metrics_summary = []

# Evaluation years
eval_years = [2021, 2022, 2023]

for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country]

    for target in target_columns:
        if target not in df_country.columns:
            continue

        actual = df_country[df_country['Year'].isin(eval_years)][target].values

        # --- ARIMA ---
        try:
            train_series = df_country[df_country['Year'].between(1950, 2020)][[target]]
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model_arima = ARIMA(train_series, order=(1, 1, 1)).fit()
            arima_pred = model_arima.predict(start=len(train_series), end=len(train_series)+len(eval_years)-1)
            arima_rmse, arima_mape, arima_r2 = calculate_metrics(actual, arima_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "ARIMA",
                "RMSE": arima_rmse, "MAPE": arima_mape, "R²": arima_r2
            })
        except:
            pass

        # --- Prophet ---
        try:
            prophet_df = df_country[df_country['Year'].between(1950, 2020)][['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model_prophet = Prophet()
            model_prophet.fit(prophet_df)
            future_eval = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            prophet_pred = model_prophet.predict(future_eval)['yhat'].values
            prophet_rmse, prophet_mape, prophet_r2 = calculate_metrics(actual, prophet_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Prophet",
                "RMSE": prophet_rmse, "MAPE": prophet_mape, "R²": prophet_r2
            })
        except:
            pass

        # --- Random Forest ---
        try:
            features = selected_features_dict.get(target, [])
            available = [f for f in features if f in df_country.columns]
            X = df_country[available]
            y = df_country[target]
            X_train = X[df_country['Year'].between(1950, 2020)]
            y_train = y[df_country['Year'].between(1950, 2020)]
            X_eval = X[df_country['Year'].isin(eval_years)]
            model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
            model_rf.fit(X_train, y_train)
            rf_pred = model_rf.predict(X_eval)
            rf_rmse, rf_mape, rf_r2 = calculate_metrics(actual, rf_pred)
            metrics_summary.append({
                "Country": country, "Target": target, "Model": "Random Forest",
                "RMSE": rf_rmse, "MAPE": rf_mape, "R²": rf_r2
            })
        except:
            pass

def pick_best_model(group):
    return group.loc[group['RMSE'].idxmin(), 'Model']

# Convert to DataFrame
df_metrics = pd.DataFrame(metrics_summary)

# Sort it and assign it to df_metrics_sorted
df_metrics_sorted = df_metrics.sort_values(['Country', 'Target', 'Model']).reset_index(drop=True)

# Best model picker function
def pick_best_model(group):
    return group.loc[group['RMSE'].idxmin(), 'Model']

# Assign Best_Model using groupby and transform
df_metrics_sorted['Best_Model'] = df_metrics_sorted.groupby(['Country', 'Target'])['RMSE'].transform(
    lambda x: df_metrics_sorted.loc[x.idxmin(), 'Model']
)

# Display full table
print("\n🎯 Step 20: Evaluation Summary with Best Model\n")
print(df_metrics_sorted[['Country', 'Target', 'Model', 'RMSE', 'MAPE', 'R²', 'Best_Model']].to_string(index=False))
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/96agqxb4.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a63jie00.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=58230', 'data', 'file=/tmp/tmprjkocm4m/96agqxb4.json', 'init=/tmp/tmprjkocm4m/a63jie00.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelrbxqmd3k/prophet_model-20250723142811.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:11 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/f7yn6mcf.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vekzyhes.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=5483', 'data', 'file=/tmp/tmprjkocm4m/f7yn6mcf.json', 'init=/tmp/tmprjkocm4m/vekzyhes.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelutxoc49d/prophet_model-20250723142812.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:12 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:12 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/56jfux6d.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/nxnbae66.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=42415', 'data', 'file=/tmp/tmprjkocm4m/56jfux6d.json', 'init=/tmp/tmprjkocm4m/nxnbae66.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model78j27m6s/prophet_model-20250723142812.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:12 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9ph1dk0k.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/to_q09k2.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=80779', 'data', 'file=/tmp/tmprjkocm4m/9ph1dk0k.json', 'init=/tmp/tmprjkocm4m/to_q09k2.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3wcrj73h/prophet_model-20250723142813.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:13 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8lcv5y6z.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/kjesknro.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=15638', 'data', 'file=/tmp/tmprjkocm4m/8lcv5y6z.json', 'init=/tmp/tmprjkocm4m/kjesknro.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelcgb49lbt/prophet_model-20250723142814.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:14 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:14 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/75qvkpfx.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/i2i8_56b.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=7959', 'data', 'file=/tmp/tmprjkocm4m/75qvkpfx.json', 'init=/tmp/tmprjkocm4m/i2i8_56b.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelp1rbzqxl/prophet_model-20250723142815.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:15 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:16 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/35m11l02.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2cf1fb8e.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=45962', 'data', 'file=/tmp/tmprjkocm4m/35m11l02.json', 'init=/tmp/tmprjkocm4m/2cf1fb8e.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpjdmugoc/prophet_model-20250723142816.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:16 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6b03wzex.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/r6bgqq_5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=36355', 'data', 'file=/tmp/tmprjkocm4m/6b03wzex.json', 'init=/tmp/tmprjkocm4m/r6bgqq_5.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model01dxs0d2/prophet_model-20250723142818.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:18 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:19 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/h_0isjxz.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4666mwro.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=50895', 'data', 'file=/tmp/tmprjkocm4m/h_0isjxz.json', 'init=/tmp/tmprjkocm4m/4666mwro.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelujssn5hl/prophet_model-20250723142820.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:20 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q5094zuy.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/d21_bqlg.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=42067', 'data', 'file=/tmp/tmprjkocm4m/q5094zuy.json', 'init=/tmp/tmprjkocm4m/d21_bqlg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelavpi2gqq/prophet_model-20250723142822.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:22 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:22 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t5p7euuw.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4lqozamx.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47422', 'data', 'file=/tmp/tmprjkocm4m/t5p7euuw.json', 'init=/tmp/tmprjkocm4m/4lqozamx.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelygoqwes7/prophet_model-20250723142823.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:23 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:24 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tzl8hmsg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/aqj2c0s9.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=54955', 'data', 'file=/tmp/tmprjkocm4m/tzl8hmsg.json', 'init=/tmp/tmprjkocm4m/aqj2c0s9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model34c4yai4/prophet_model-20250723142824.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:24 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:25 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1jdjv1ym.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/5ply91ys.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=16233', 'data', 'file=/tmp/tmprjkocm4m/1jdjv1ym.json', 'init=/tmp/tmprjkocm4m/5ply91ys.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeln8xdotuh/prophet_model-20250723142825.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:25 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:26 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tg6xi57o.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/em7pt54s.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=82535', 'data', 'file=/tmp/tmprjkocm4m/tg6xi57o.json', 'init=/tmp/tmprjkocm4m/em7pt54s.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modellbr_d7wm/prophet_model-20250723142827.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:27 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:28 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/56s2q0ui.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gy08iedo.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=60748', 'data', 'file=/tmp/tmprjkocm4m/56s2q0ui.json', 'init=/tmp/tmprjkocm4m/gy08iedo.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelg7jfvios/prophet_model-20250723142828.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:28 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:28 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/mu2y1iux.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/v7jgd4r2.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=9337', 'data', 'file=/tmp/tmprjkocm4m/mu2y1iux.json', 'init=/tmp/tmprjkocm4m/v7jgd4r2.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelku5nak1k/prophet_model-20250723142829.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:29 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:29 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jzvyvewp.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/mr48mr14.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=23976', 'data', 'file=/tmp/tmprjkocm4m/jzvyvewp.json', 'init=/tmp/tmprjkocm4m/mr48mr14.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model3lpo3ui7/prophet_model-20250723142829.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:29 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:30 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ofu8h322.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/k8n5hxd9.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=70178', 'data', 'file=/tmp/tmprjkocm4m/ofu8h322.json', 'init=/tmp/tmprjkocm4m/k8n5hxd9.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelxh05y0r4/prophet_model-20250723142830.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:30 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:30 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/g2xiz9hi.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/t5g5emio.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22075', 'data', 'file=/tmp/tmprjkocm4m/g2xiz9hi.json', 'init=/tmp/tmprjkocm4m/t5g5emio.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8cduqjex/prophet_model-20250723142831.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:31 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:31 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/m0rwqk4l.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xnpt149d.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77601', 'data', 'file=/tmp/tmprjkocm4m/m0rwqk4l.json', 'init=/tmp/tmprjkocm4m/xnpt149d.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeldubr4w1q/prophet_model-20250723142831.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:31 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:32 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e2it4bm5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jh2zfokd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=61228', 'data', 'file=/tmp/tmprjkocm4m/e2it4bm5.json', 'init=/tmp/tmprjkocm4m/jh2zfokd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelhr6l8_bh/prophet_model-20250723142833.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:33 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:33 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lnzx2fbb.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ledcbnmk.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22655', 'data', 'file=/tmp/tmprjkocm4m/lnzx2fbb.json', 'init=/tmp/tmprjkocm4m/ledcbnmk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeld9g72k_0/prophet_model-20250723142834.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:34 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:35 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/abygc97f.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3vxzg627.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=41273', 'data', 'file=/tmp/tmprjkocm4m/abygc97f.json', 'init=/tmp/tmprjkocm4m/3vxzg627.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpq_jed9k/prophet_model-20250723142835.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:35 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:35 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pwjgxwnc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qvr8h8ur.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=51176', 'data', 'file=/tmp/tmprjkocm4m/pwjgxwnc.json', 'init=/tmp/tmprjkocm4m/qvr8h8ur.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelukr7_ca3/prophet_model-20250723142835.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:35 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:36 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0e4wunt5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/x3p3rsx4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=22674', 'data', 'file=/tmp/tmprjkocm4m/0e4wunt5.json', 'init=/tmp/tmprjkocm4m/x3p3rsx4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzbgkl07q/prophet_model-20250723142836.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:36 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:37 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/lbc7c6mv.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/msjjsc5w.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=91260', 'data', 'file=/tmp/tmprjkocm4m/lbc7c6mv.json', 'init=/tmp/tmprjkocm4m/msjjsc5w.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model17vi9bbj/prophet_model-20250723142837.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:37 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:37 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qjasgdr_.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/u1fq__1x.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=81508', 'data', 'file=/tmp/tmprjkocm4m/qjasgdr_.json', 'init=/tmp/tmprjkocm4m/u1fq__1x.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5d4_6ltf/prophet_model-20250723142838.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:38 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:38 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/0zuanv8u.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j0tr_zru.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=72026', 'data', 'file=/tmp/tmprjkocm4m/0zuanv8u.json', 'init=/tmp/tmprjkocm4m/j0tr_zru.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model4fjx7_g4/prophet_model-20250723142838.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:38 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:39 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/m8l9oi1q.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fosm3gdg.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75488', 'data', 'file=/tmp/tmprjkocm4m/m8l9oi1q.json', 'init=/tmp/tmprjkocm4m/fosm3gdg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelit1ljs_j/prophet_model-20250723142839.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:39 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:39 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/vdfsimm5.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/559dtkca.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=38508', 'data', 'file=/tmp/tmprjkocm4m/vdfsimm5.json', 'init=/tmp/tmprjkocm4m/559dtkca.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2fwcy8wp/prophet_model-20250723142839.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
14:28:39 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
14:28:40 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
🎯 Step 20: Evaluation Summary with Best Model

      Country                  Target         Model    RMSE  MAPE            R²    Best_Model
   Bangladesh Cardiovascular diseases         ARIMA  1.1756  4.02 -1.094927e+29         ARIMA
   Bangladesh Cardiovascular diseases       Prophet  6.9912 24.69 -3.872468e+30         ARIMA
   Bangladesh Cardiovascular diseases Random Forest  4.9245 17.42 -1.921333e+30         ARIMA
   Bangladesh                Diabetes         ARIMA  0.0000  0.00  0.000000e+00         ARIMA
   Bangladesh                Diabetes       Prophet  2.9878 30.49  0.000000e+00         ARIMA
   Bangladesh                Diabetes Random Forest  0.1017  0.81  0.000000e+00         ARIMA
   Bangladesh         Life expectancy         ARIMA  2.3127  2.76 -1.102500e+00       Prophet
   Bangladesh         Life expectancy       Prophet  1.6767  1.89 -1.051000e-01       Prophet
   Bangladesh         Life expectancy Random Forest  2.2987  2.94 -1.077100e+00       Prophet
       Brazil Cardiovascular diseases         ARIMA  1.8195  4.66  0.000000e+00         ARIMA
       Brazil Cardiovascular diseases       Prophet  6.5472 16.73  0.000000e+00         ARIMA
       Brazil Cardiovascular diseases Random Forest  3.5130  9.02  0.000000e+00         ARIMA
       Brazil                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
       Brazil                Diabetes       Prophet  0.1860  2.14  0.000000e+00         ARIMA
       Brazil                Diabetes Random Forest  0.0457  0.55  0.000000e+00         ARIMA
       Brazil         Life expectancy         ARIMA  3.0096  3.29 -5.672800e+00 Random Forest
       Brazil         Life expectancy       Prophet  2.1896  2.66 -2.531900e+00 Random Forest
       Brazil         Life expectancy Random Forest  1.2862  1.52 -2.188000e-01 Random Forest
      Germany Cardiovascular diseases         ARIMA  0.4339  1.23  0.000000e+00         ARIMA
      Germany Cardiovascular diseases       Prophet  2.1255  5.82  0.000000e+00         ARIMA
      Germany Cardiovascular diseases Random Forest  0.9503  2.69  0.000000e+00         ARIMA
      Germany                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
      Germany                Diabetes       Prophet  2.7582 55.13  0.000000e+00         ARIMA
      Germany                Diabetes Random Forest  0.0000  0.00  1.000000e+00         ARIMA
      Germany         Life expectancy         ARIMA  0.4746  0.44 -1.051900e+00 Random Forest
      Germany         Life expectancy       Prophet  0.6124  0.65 -2.417000e+00 Random Forest
      Germany         Life expectancy Random Forest  0.3367  0.38 -3.260000e-02 Random Forest
        India Cardiovascular diseases         ARIMA 19.6630  6.68  0.000000e+00         ARIMA
        India Cardiovascular diseases       Prophet 37.4210 12.75  0.000000e+00         ARIMA
        India Cardiovascular diseases Random Forest 47.5512 16.61  0.000000e+00         ARIMA
        India                Diabetes         ARIMA  0.0197  0.21  0.000000e+00 Random Forest
        India                Diabetes       Prophet  0.8306  9.49  0.000000e+00 Random Forest
        India                Diabetes Random Forest  0.0017  0.01  0.000000e+00 Random Forest
        India         Life expectancy         ARIMA  1.9737  2.25  1.628000e-01         ARIMA
        India         Life expectancy       Prophet  2.4758  2.42 -3.173000e-01         ARIMA
        India         Life expectancy Random Forest  2.1906  2.96 -3.130000e-02         ARIMA
    Indonesia Cardiovascular diseases         ARIMA  8.4866 11.75  0.000000e+00 Random Forest
    Indonesia Cardiovascular diseases       Prophet  7.9981  9.90  0.000000e+00 Random Forest
    Indonesia Cardiovascular diseases Random Forest  0.0971  0.13  0.000000e+00 Random Forest
    Indonesia                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
    Indonesia                Diabetes       Prophet  0.7121  9.24  0.000000e+00         ARIMA
    Indonesia                Diabetes Random Forest  0.0035  0.03  0.000000e+00         ARIMA
    Indonesia         Life expectancy         ARIMA  1.8872  2.68 -2.444000e-01 Random Forest
    Indonesia         Life expectancy       Prophet  1.6929  1.48 -1.400000e-03 Random Forest
    Indonesia         Life expectancy Random Forest  1.6442  2.28  5.540000e-02 Random Forest
        Japan Cardiovascular diseases         ARIMA  1.5477  3.73  0.000000e+00         ARIMA
        Japan Cardiovascular diseases       Prophet  7.6884 18.56  0.000000e+00         ARIMA
        Japan Cardiovascular diseases Random Forest  4.2376 10.27  0.000000e+00         ARIMA
        Japan                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
        Japan                Diabetes       Prophet  1.8411 27.47  0.000000e+00         ARIMA
        Japan                Diabetes Random Forest  0.0162  0.14  0.000000e+00         ARIMA
        Japan         Life expectancy         ARIMA  0.6387  0.68 -4.204900e+00 Random Forest
        Japan         Life expectancy       Prophet  0.5765  0.59 -3.239500e+00 Random Forest
        Japan         Life expectancy Random Forest  0.3200  0.37 -3.061000e-01 Random Forest
        Kenya Cardiovascular diseases         ARIMA  0.1218  3.48 -7.516462e+28         ARIMA
        Kenya Cardiovascular diseases       Prophet  0.9335 26.66 -4.418335e+30         ARIMA
        Kenya Cardiovascular diseases Random Forest  0.7993 22.85 -3.239297e+30         ARIMA
        Kenya                Diabetes         ARIMA  0.0004  0.01  0.000000e+00         ARIMA
        Kenya                Diabetes       Prophet  3.4797 57.98  0.000000e+00         ARIMA
        Kenya                Diabetes Random Forest  0.0052  0.08  0.000000e+00         ARIMA
        Kenya         Life expectancy         ARIMA  3.2353  4.35 -7.360000e+00 Random Forest
        Kenya         Life expectancy       Prophet  1.6706  2.25 -1.228900e+00 Random Forest
        Kenya         Life expectancy Random Forest  1.2934  1.95 -3.360000e-01 Random Forest
       Mexico Cardiovascular diseases         ARIMA  0.5788  2.17 -2.654270e+28         ARIMA
       Mexico Cardiovascular diseases       Prophet  0.8437  3.08 -5.639601e+28         ARIMA
       Mexico Cardiovascular diseases Random Forest  6.2764 28.37 -3.121092e+30         ARIMA
       Mexico                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
       Mexico                Diabetes       Prophet  0.7997  7.13 -2.026747e+29         ARIMA
       Mexico                Diabetes Random Forest  0.4129  3.01 -5.403202e+28         ARIMA
       Mexico         Life expectancy         ARIMA  6.2245  7.05 -6.367500e+00       Prophet
       Mexico         Life expectancy       Prophet  2.4286  2.54 -1.216000e-01       Prophet
       Mexico         Life expectancy Random Forest  2.4902  3.34 -1.791000e-01       Prophet
      Nigeria Cardiovascular diseases         ARIMA  0.7164  3.98  0.000000e+00         ARIMA
      Nigeria Cardiovascular diseases       Prophet  4.4984 24.97  0.000000e+00         ARIMA
      Nigeria Cardiovascular diseases Random Forest  3.6177 20.11  0.000000e+00         ARIMA
      Nigeria                Diabetes         ARIMA  0.0000  0.00  1.000000e+00         ARIMA
      Nigeria                Diabetes       Prophet  0.1408  2.06  0.000000e+00         ARIMA
      Nigeria                Diabetes Random Forest  0.0027  0.04  0.000000e+00         ARIMA
      Nigeria         Life expectancy         ARIMA  0.7003  1.17 -1.846100e+00       Prophet
      Nigeria         Life expectancy       Prophet  0.3693  0.58  2.086000e-01       Prophet
      Nigeria         Life expectancy Random Forest  1.2444  2.14 -7.985900e+00       Prophet
United States Cardiovascular diseases         ARIMA  1.1904  1.29  0.000000e+00         ARIMA
United States Cardiovascular diseases       Prophet 11.9749 12.93  0.000000e+00         ARIMA
United States Cardiovascular diseases Random Forest 10.0919 10.96  0.000000e+00         ARIMA
United States                Diabetes         ARIMA  0.0080  0.10  0.000000e+00 Random Forest
United States                Diabetes       Prophet  0.4896  6.65  0.000000e+00 Random Forest
United States                Diabetes Random Forest  0.0040  0.05  0.000000e+00 Random Forest
United States         Life expectancy         ARIMA  1.9969  2.10 -1.796800e+00 Random Forest
United States         Life expectancy       Prophet  1.5614  1.63 -7.100000e-01 Random Forest
United States         Life expectancy Random Forest  1.2177  1.39 -3.990000e-02 Random Forest

Forecast Comparison of ARIMA, Prophet, and Random Forest Across Three Targets¶

In [ ]:
# Forecast Comparison of ARIMA, Prophet, and Random Forest Across Three Targets
import matplotlib.pyplot as plt
import numpy as np

def plot_target_forecast(df_model_all, df_eval_ready, country, target):
    # Years
    full_years = list(range(1950, 2075))
    eval_years = [2021, 2022, 2023]
    forecast_start, forecast_end = 2024, 2074

    # === Actual values for 1950–2023
    df_actual = df_eval_ready[
        (df_eval_ready['Country'] == country) &
        (df_eval_ready['Year'].between(1950, 2023))
    ].sort_values('Year')
    actual_years = df_actual['Year'].values
    actual_vals = df_actual[target].values

    # === Predictions from df_model_comparison for 2021–2074
    df_pred = df_model_all[
        (df_model_all['Country'] == country) &
        (df_model_all['Target'] == target) &
        (df_model_all['Year'].between(2021, 2074))
    ].sort_values('Year')
    pred_years = df_pred['Year'].values
    rf_vals = df_pred['RF_Forecast'].values
    arima_vals = df_pred['ARIMA_Forecast'].values
    prophet_vals = df_pred['Prophet_Forecast'].values

    # Split prediction into eval + forecast ranges
    rf_eval, rf_forecast = [], []
    arima_eval, arima_forecast = [], []
    prophet_eval, prophet_forecast = [], []

    for yr, rf, ar, pr in zip(pred_years, rf_vals, arima_vals, prophet_vals):
        if yr in eval_years:
            rf_eval.append((yr, rf))
            arima_eval.append((yr, ar))
            prophet_eval.append((yr, pr))
        else:
            rf_forecast.append((yr, rf))
            arima_forecast.append((yr, ar))
            prophet_forecast.append((yr, pr))

    # Begin plot
    plt.figure(figsize=(14, 6))

    # Shaded forecast area
    plt.axvspan(forecast_start, forecast_end, color='gray', alpha=0.12, label="Forecast Horizon")

    # Actual line
    plt.plot(actual_years, actual_vals, label="Actual", color='orange', linewidth=2)

    # Prediction lines (2021–2023)
    if rf_eval: plt.plot(*zip(*rf_eval), label="RF Eval", color='dodgerblue', linestyle='dashed', linewidth=2)
    if arima_eval: plt.plot(*zip(*arima_eval), label="ARIMA Eval", color='forestgreen', linestyle='dashed', linewidth=2)
    if prophet_eval: plt.plot(*zip(*prophet_eval), label="Prophet Eval", color='darkorchid', linestyle='dashed', linewidth=2)

    # Forecast lines (2024–2074)
    if rf_forecast: plt.plot(*zip(*rf_forecast), label="RF Forecast", color='dodgerblue', linewidth=2)
    if arima_forecast: plt.plot(*zip(*arima_forecast), label="ARIMA Forecast", color='forestgreen', linewidth=2)
    if prophet_forecast: plt.plot(*zip(*prophet_forecast), label="Prophet Forecast", color='darkorchid', linewidth=2)

    # Final plot touches
    plt.title(f"{target} — Actual, Evaluation & Forecast Comparison ({country})", fontsize=16)
    plt.xlabel("Year")
    plt.ylabel("Value")
    plt.grid(True)
    plt.legend()
    plt.xlim(1950, 2074)
    plt.tight_layout()
    plt.show()

selected_countries = [
    'United States', 'Germany', 'Japan', 'Brazil', 'India',
    'Indonesia', 'Nigeria', 'Kenya', 'Mexico', 'Bangladesh'
]
selected_targets = ["Life expectancy", "Diabetes", "Cardiovascular diseases"]

for country in selected_countries:
    for target in selected_targets:
        plot_target_forecast(df_model_comparison, df_forecast_ready, country, target)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Plot comparison for 4 countries

import matplotlib.pyplot as plt
import seaborn as sns

# Countries and years to plot
countries_to_plot = ['United States', 'Mexico', 'India', 'Japan']
years_to_plot = [2021, 2022, 2023]

# combine all result into one dataframe
val_results = pd.concat([
    pd.concat(arima_val_all, ignore_index=True),
    pd.concat(prophet_val_all, ignore_index=True),
    pd.concat(rf_val_all, ignore_index=True)
], ignore_index=True)

val_results['Model'] = val_results['Model'].replace({'RandomForest': 'Random Forest'})

# Filter validation results for these countries and years
plot_df = val_results[
    (val_results['Country'].isin(countries_to_plot)) &
    (val_results['Year'].isin(years_to_plot))
].copy()

# Example for one target variable, say target = 'Cardiovascular diseases'
target_of_interest = 'Cardiovascular diseases'
plot_df = plot_df[plot_df['Target'] == target_of_interest]

# Set seaborn style
sns.set(style="whitegrid")

# Create a separate plot for each country with actual vs predicted lines for each model
fig, axs = plt.subplots(2, 2, figsize=(16, 10), sharey=True)
axs = axs.flatten()

for i, country in enumerate(countries_to_plot):
    ax = axs[i]
    country_data = plot_df[plot_df['Country'] == country]

    # Plot Actual values
    actual_data = country_data[['Year', 'Actual']].drop_duplicates()
    ax.plot(actual_data['Year'], actual_data['Actual'], label='Actual', color='black', marker='o')

    # Plot Forecasts from each model
    for model in country_data['Model'].unique():
        model_data = country_data[country_data['Model'] == model]
        ax.plot(model_data['Year'], model_data['Forecast'], label=f'Forecast ({model})', marker='x')

    ax.set_title(f'{country} - Actual vs Predicted ({target_of_interest})')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
# Plot Acutal vs Predict (RF, ARIMA, Prophet )

# Plots Testing- Actual vs Predict (RF, ARIMA, Prophet) -    18 July

import matplotlib.pyplot as plt
import numpy as np

def plot_target_forecast(df, country, target):
    # Filter data for country & target
    df_ct = df[(df['Country'] == country) & (df['Target'] == target)].sort_values('Year')

    # Extract data
    years = df_ct['Year']
    arima = df_ct['ARIMA_Forecast']
    rf = df_ct['RF_Forecast']
    prophet = df_ct['Prophet_Forecast']

    # Actual years
    actual_years = [2021, 2022, 2023]
    forecast_years = list(range(2024, 2075))
    actual_mask = df_ct['Year'].isin(actual_years)

    # Use ARIMA prediction as proxy for observed if needed
    actual_vals = arima[actual_mask]

    # Start plot
    plt.figure(figsize=(13, 6))

    # Forecast region shading
    plt.axvspan(2024, 2074, color='gray', alpha=0.12, label='Forecast Horizon')

    # Plot forecasts
    plt.plot(years, arima, color='forestgreen', linewidth=2, label='ARIMA Forecast')
    plt.plot(years, prophet, color='darkorchid', linewidth=2, label='Prophet Forecast')
    plt.plot(years, rf, color='navy', linewidth=2, label='Random Forest Forecast')

    # Plot actual values
    plt.scatter(df_ct.loc[actual_mask, 'Year'], actual_vals,
                color='orange', edgecolor='black', s=90,
                label='Observed (2021–2023)', zorder=5)

    # Final touches
    plt.title(f"{target} Forecast — {country}", fontsize=16)
    plt.xlabel("Year")
    plt.ylabel("Value")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    selected_countries = [
    'United States', 'Germany', 'Japan', 'Brazil', 'India',
    'Indonesia', 'Nigeria', 'Kenya', 'Mexico', 'Bangladesh'
]

for country in selected_countries:
    for target in ["Life expectancy", "Diabetes", "Cardiovascular diseases"]:
        plot_target_forecast(df_model_comparison, country, target)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
# Plots Testing- Actual vs Predict (RF, ARIMA, Prophet)

forecast_summary = []

for country in selected_countries:
    df_country = df_forecast_ready[df_forecast_ready['Country'] == country].sort_values('Year')

    for target in target_columns:
        if target not in df_country.columns:
            continue

        features = selected_features_dict.get(target, [])
        available_features = [f for f in features if f in df_country.columns]
        if not available_features:
            continue

        df_train = df_country[df_country['Year'].between(start_train, end_train)]
        df_eval = df_country[df_country['Year'].isin(eval_years)]
        df_forecast = df_country[df_country['Year'].isin(forecast_horizon)]
        actual_eval = df_eval[target].values

        #### ARIMA ####
        arima_rmse, arima_forecast_eval, arima_forecast = None, [], []
        try:
            train_series = df_train[[target]].copy()
            train_series.index = pd.date_range(start='1950', periods=len(train_series), freq='YE')
            model = ARIMA(train_series, order=(1, 1, 1)).fit()
            pred_eval_arima = model.predict(start=len(train_series), end=len(train_series)+len(df_eval)-1)
            arima_rmse = np.sqrt(mean_squared_error(actual_eval, pred_eval_arima))
            arima_forecast_eval = pred_eval_arima.tolist()
            arima_forecast = model.predict(start=len(train_series)+len(df_eval),
                                           end=len(train_series)+len(df_eval)+len(df_forecast)-1).tolist()
        except:
            pass

        #### Prophet ####
        prophet_rmse, prophet_forecast_eval, prophet_forecast = None, [], []
        try:
            prophet_df = df_train[['Year', target]].rename(columns={'Year': 'ds', target: 'y'})
            prophet_df['ds'] = pd.to_datetime(prophet_df['ds'], format='%Y')
            model = Prophet()
            model.fit(prophet_df)
            eval_dates = pd.DataFrame({'ds': pd.to_datetime(eval_years, format='%Y')})
            forecast_eval_prophet = model.predict(eval_dates)
            prophet_rmse = np.sqrt(mean_squared_error(actual_eval, forecast_eval_prophet['yhat'].values))
            prophet_forecast_eval = forecast_eval_prophet['yhat'].tolist()
            forecast_years_df = pd.DataFrame({'ds': pd.to_datetime(df_forecast['Year'], format='%Y')})
            prophet_forecast = model.predict(forecast_years_df)['yhat'].tolist()
        except:
            pass

        #### RF ####
        rf_rmse, rf_forecast_eval, rf_forecast = None, [], []
        try:
            X = df_country[available_features]
            y = df_country[target]
            X_train = X[df_country['Year'].between(start_train, end_train)]
            y_train = y[df_country['Year'].between(start_train, end_train)]
            X_eval = X[df_country['Year'].isin(eval_years)]
            y_eval = y[df_country['Year'].isin(eval_years)]
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            pred_eval_rf = model.predict(X_eval)
            rf_rmse = np.sqrt(mean_squared_error(y_eval, pred_eval_rf))
            rf_forecast_eval = pred_eval_rf.tolist()
            X_forecast = X[df_country['Year'].isin(forecast_horizon)]
            rf_forecast = model.predict(X_forecast).tolist() if not X_forecast.isnull().any(axis=1).any() else [None]*len(X_forecast)
        except:
            pass

        # Append evaluation predictions
        for i, year in enumerate(eval_years):
            forecast_summary.append({
                "Country": country,
                "Target": target,
                "Year": year,
                "ARIMA_RMSE": arima_rmse,
                "ARIMA_Forecast": arima_forecast_eval[i] if i < len(arima_forecast_eval) else None,
                "Prophet_RMSE": prophet_rmse,
                "Prophet_Forecast": prophet_forecast_eval[i] if i < len(prophet_forecast_eval) else None,
                "RF_RMSE": rf_rmse,
                "RF_Forecast": rf_forecast_eval[i] if i < len(rf_forecast_eval) else None
            })

        # Append future forecast predictions
        for i, year in enumerate(df_forecast['Year']):
            forecast_summary.append({
                "Country": country,
                "Target": target,
                "Year": year,
                "ARIMA_RMSE": arima_rmse,
                "ARIMA_Forecast": arima_forecast[i] if i < len(arima_forecast) else None,
                "Prophet_RMSE": prophet_rmse,
                "Prophet_Forecast": prophet_forecast[i] if i < len(prophet_forecast) else None,
                "RF_RMSE": rf_rmse,
                "RF_Forecast": rf_forecast[i] if i < len(rf_forecast) else None
            })

df_model_comparison = pd.DataFrame(forecast_summary).sort_values(["Country", "Target", "Year"])

def plot_target_forecast(df_model_all, df_eval_ready, country, target):
    eval_years = [2021, 2022, 2023]
    forecast_years = list(range(2024, 2075))
    full_years = eval_years + forecast_years

    df_actual = df_eval_ready[
        (df_eval_ready['Country'] == country) &
        (df_eval_ready['Year'].isin(eval_years))
    ][['Year', target]].sort_values('Year')

    df_plot = df_model_all[
        (df_model_all['Country'] == country) &
        (df_model_all['Target'] == target) &
        (df_model_all['Year'].isin(full_years))
    ].sort_values('Year')

    years = df_plot['Year'].values
    rf_vals = df_plot['RF_Forecast'].values
    arima_vals = df_plot['ARIMA_Forecast'].values
    prophet_vals = df_plot['Prophet_Forecast'].values

    # Build actual line
    actual_line = []
    for yr in years:
        val = df_actual[df_actual['Year'] == yr][target]
        actual_line.append(val.values[0] if not val.empty else np.nan)

    # Plot
    plt.figure(figsize=(13, 6))
    plt.axvspan(2024, 2074, color='gray', alpha=0.12, label='Forecast Horizon')
    plt.plot(years, actual_line, label="🟧 Actual", color='orange', linewidth=2)
    plt.plot(years, rf_vals, label="🔵 RF Prediction", color='dodgerblue', linewidth=2)
    plt.plot(years, arima_vals, label="🟩 ARIMA Prediction", color='forestgreen', linewidth=2)
    plt.plot(years, prophet_vals, label="🟣 Prophet Prediction", color='darkorchid', linewidth=2)
    plt.title(f"{target} — Actual & Forecast Comparison ({country})", fontsize=16)
    plt.xlabel("Year")
    plt.ylabel("Value")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


selected_countries = [
    'United States', 'Germany', 'Japan', 'Brazil', 'India',
    'Indonesia', 'Nigeria', 'Kenya', 'Mexico', 'Bangladesh'
]
selected_targets = ["Life expectancy", "Diabetes", "Cardiovascular diseases"]

for country in selected_countries:
    for target in selected_targets:
        plot_target_forecast(df_model_comparison, df_forecast_ready, country, target)
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/xcn15p9f.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/fq7s2wc4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=56323', 'data', 'file=/tmp/tmprjkocm4m/xcn15p9f.json', 'init=/tmp/tmprjkocm4m/fq7s2wc4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelv0_il2me/prophet_model-20250723150106.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:06 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:08 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3i0cknr4.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/n0ji9b3r.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=90929', 'data', 'file=/tmp/tmprjkocm4m/3i0cknr4.json', 'init=/tmp/tmprjkocm4m/n0ji9b3r.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelg7_3e6i4/prophet_model-20250723150110.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:10 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:10 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/94b6i4ty.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/wiku0dsr.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=97008', 'data', 'file=/tmp/tmprjkocm4m/94b6i4ty.json', 'init=/tmp/tmprjkocm4m/wiku0dsr.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelh4lwn385/prophet_model-20250723150111.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:11 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:12 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/qg7p85e3.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4sctqmvy.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=44341', 'data', 'file=/tmp/tmprjkocm4m/qg7p85e3.json', 'init=/tmp/tmprjkocm4m/4sctqmvy.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelw54y1lrb/prophet_model-20250723150112.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:12 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jhcw5fgj.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gee4nwbv.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=89937', 'data', 'file=/tmp/tmprjkocm4m/jhcw5fgj.json', 'init=/tmp/tmprjkocm4m/gee4nwbv.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelpy6sfrua/prophet_model-20250723150113.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:13 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:13 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/hya61vxx.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gqd80ecs.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=95356', 'data', 'file=/tmp/tmprjkocm4m/hya61vxx.json', 'init=/tmp/tmprjkocm4m/gqd80ecs.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeltqfulq0n/prophet_model-20250723150114.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:14 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:14 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ko5xb_ld.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/41l5k7jf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=6632', 'data', 'file=/tmp/tmprjkocm4m/ko5xb_ld.json', 'init=/tmp/tmprjkocm4m/41l5k7jf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelwh4bbhju/prophet_model-20250723150115.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:15 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:15 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/brryefbv.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/79kzkg23.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=45981', 'data', 'file=/tmp/tmprjkocm4m/brryefbv.json', 'init=/tmp/tmprjkocm4m/79kzkg23.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8a83zuam/prophet_model-20250723150115.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:15 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:16 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/1vtpzvl4.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/zo3hc0yn.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=27575', 'data', 'file=/tmp/tmprjkocm4m/1vtpzvl4.json', 'init=/tmp/tmprjkocm4m/zo3hc0yn.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelyvq_dlwd/prophet_model-20250723150116.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:16 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:16 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e7rkf5ty.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/iqbldoj4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=39930', 'data', 'file=/tmp/tmprjkocm4m/e7rkf5ty.json', 'init=/tmp/tmprjkocm4m/iqbldoj4.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model2c3lh1fk/prophet_model-20250723150117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/yq_o5a0j.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q586xxt7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=49176', 'data', 'file=/tmp/tmprjkocm4m/yq_o5a0j.json', 'init=/tmp/tmprjkocm4m/q586xxt7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model47zxobpr/prophet_model-20250723150117.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:17 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:18 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4a97gqcs.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/b2zn8__y.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25471', 'data', 'file=/tmp/tmprjkocm4m/4a97gqcs.json', 'init=/tmp/tmprjkocm4m/b2zn8__y.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_33hc8lw/prophet_model-20250723150118.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:18 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:18 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/buq0wfdv.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/en47nn7i.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=33930', 'data', 'file=/tmp/tmprjkocm4m/buq0wfdv.json', 'init=/tmp/tmprjkocm4m/en47nn7i.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model05mkl102/prophet_model-20250723150119.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:19 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:19 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/6k7ywp4j.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/s5jjzobg.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=4964', 'data', 'file=/tmp/tmprjkocm4m/6k7ywp4j.json', 'init=/tmp/tmprjkocm4m/s5jjzobg.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model_qx7pexn/prophet_model-20250723150120.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:20 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:20 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4tykcuex.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4uz180rf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=19771', 'data', 'file=/tmp/tmprjkocm4m/4tykcuex.json', 'init=/tmp/tmprjkocm4m/4uz180rf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modeleq3wjnzq/prophet_model-20250723150121.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:21 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/dub9k5jc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/a_8fsjbb.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=27947', 'data', 'file=/tmp/tmprjkocm4m/dub9k5jc.json', 'init=/tmp/tmprjkocm4m/a_8fsjbb.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model8cz227c3/prophet_model-20250723150122.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:22 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:23 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/e2sg94bz.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ma6e3h6o.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=59076', 'data', 'file=/tmp/tmprjkocm4m/e2sg94bz.json', 'init=/tmp/tmprjkocm4m/ma6e3h6o.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modellx89hbtk/prophet_model-20250723150123.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:23 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:24 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2foc1vzw.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3p1hzrs8.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25301', 'data', 'file=/tmp/tmprjkocm4m/2foc1vzw.json', 'init=/tmp/tmprjkocm4m/3p1hzrs8.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelnfinlbmz/prophet_model-20250723150124.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:24 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:24 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4w6q0v9n.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ttzzynt3.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=40853', 'data', 'file=/tmp/tmprjkocm4m/4w6q0v9n.json', 'init=/tmp/tmprjkocm4m/ttzzynt3.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5g2l9frw/prophet_model-20250723150125.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:25 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:25 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/q6sitoku.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jxnz0p82.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=45329', 'data', 'file=/tmp/tmprjkocm4m/q6sitoku.json', 'init=/tmp/tmprjkocm4m/jxnz0p82.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelbomqvztx/prophet_model-20250723150125.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:25 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:26 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sxj4mx1i.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/j3ff4el7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=55135', 'data', 'file=/tmp/tmprjkocm4m/sxj4mx1i.json', 'init=/tmp/tmprjkocm4m/j3ff4el7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model0a2p2g90/prophet_model-20250723150126.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:26 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:26 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/gpjbd4kf.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/4be0jlhe.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=64708', 'data', 'file=/tmp/tmprjkocm4m/gpjbd4kf.json', 'init=/tmp/tmprjkocm4m/4be0jlhe.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelj_13ox_d/prophet_model-20250723150127.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:27 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:27 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/tyabzpsb.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/dsimj0a7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=75251', 'data', 'file=/tmp/tmprjkocm4m/tyabzpsb.json', 'init=/tmp/tmprjkocm4m/dsimj0a7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model416aph0l/prophet_model-20250723150128.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:28 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:28 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/8eok89ec.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ngwetz9c.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=13514', 'data', 'file=/tmp/tmprjkocm4m/8eok89ec.json', 'init=/tmp/tmprjkocm4m/ngwetz9c.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model6ra7svlu/prophet_model-20250723150128.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:28 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:29 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ctysfzxc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/py2f0mie.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=79130', 'data', 'file=/tmp/tmprjkocm4m/ctysfzxc.json', 'init=/tmp/tmprjkocm4m/py2f0mie.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelfzuldb6m/prophet_model-20250723150129.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:29 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:30 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/sorq_sl7.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/3cpdd1hk.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=48566', 'data', 'file=/tmp/tmprjkocm4m/sorq_sl7.json', 'init=/tmp/tmprjkocm4m/3cpdd1hk.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model5qngdowd/prophet_model-20250723150130.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:30 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:30 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ts10rp61.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/9xnd_9dl.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=2376', 'data', 'file=/tmp/tmprjkocm4m/ts10rp61.json', 'init=/tmp/tmprjkocm4m/9xnd_9dl.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_model4cm1tcdf/prophet_model-20250723150131.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:31 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:31 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/ki_scwy0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/pvgasxyf.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=23559', 'data', 'file=/tmp/tmprjkocm4m/ki_scwy0.json', 'init=/tmp/tmprjkocm4m/pvgasxyf.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelzzp4uuj8/prophet_model-20250723150131.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:31 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:32 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/jg311jj0.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/2bsjz2e7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=9497', 'data', 'file=/tmp/tmprjkocm4m/jg311jj0.json', 'init=/tmp/tmprjkocm4m/2bsjz2e7.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelqgx6y8ti/prophet_model-20250723150132.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:32 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:32 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/v5h3o2i9.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmprjkocm4m/mpabi3zd.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=27575', 'data', 'file=/tmp/tmprjkocm4m/v5h3o2i9.json', 'init=/tmp/tmprjkocm4m/mpabi3zd.json', 'output', 'file=/tmp/tmprjkocm4m/prophet_modelj5nlrp04/prophet_model-20250723150133.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
15:01:33 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
15:01:33 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image